Statistics
| Revision:

gvsig-scripting / org.gvsig.scripting / trunk / org.gvsig.scripting / org.gvsig.scripting.app / org.gvsig.scripting.app.mainplugin / src / main / resources-plugin / scripting / lib / pylint / checkers / strings.py @ 745

History | View | Annotate | Download (26.5 KB)

1
# Copyright (c) 2009-2010 Arista Networks, Inc. - James Lingard
2
# Copyright (c) 2004-2013 LOGILAB S.A. (Paris, FRANCE).
3
# Copyright 2012 Google Inc.
4
#
5
# http://www.logilab.fr/ -- mailto:contact@logilab.fr
6
# This program is free software; you can redistribute it and/or modify it under
7
# the terms of the GNU General Public License as published by the Free Software
8
# Foundation; either version 2 of the License, or (at your option) any later
9
# version.
10
#
11
# This program is distributed in the hope that it will be useful, but WITHOUT
12
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details
14
#
15
# You should have received a copy of the GNU General Public License along with
16
# this program; if not, write to the Free Software Foundation, Inc.,
17
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18
"""Checker for string formatting operations.
19
"""
20

    
21
import sys
22
import tokenize
23
import string
24
import numbers
25

    
26
import six
27

    
28
import astroid
29
from pylint.interfaces import ITokenChecker, IAstroidChecker, IRawChecker
30
from pylint.checkers import BaseChecker, BaseTokenChecker
31
from pylint.checkers import utils
32
from pylint.checkers.utils import check_messages
33

    
34

    
35
# Interpreter-version switches used to pick compatible code paths below.
_PY3K = sys.version_info[0] >= 3
_PY27 = (sys.version_info[0], sys.version_info[1]) == (2, 7)
37

    
38
# Message table for the string format checker.
# Each entry maps a message id to a tuple of
# (message template, symbolic name, description[, options dict]).
# Descriptions are built from adjacent string literals so that no source
# indentation leaks into the help text.
MSGS = {
    'E1300': ("Unsupported format character %r (%#02x) at index %d",
              "bad-format-character",
              "Used when an unsupported format character is used in a format "
              "string."),
    'E1301': ("Format string ends in middle of conversion specifier",
              "truncated-format-string",
              "Used when a format string terminates before the end of a "
              "conversion specifier."),
    'E1302': ("Mixing named and unnamed conversion specifiers in format string",
              "mixed-format-string",
              "Used when a format string contains both named (e.g. '%(foo)d') "
              "and unnamed (e.g. '%d') conversion specifiers. This is also "
              "used when a named conversion specifier contains * for the "
              "minimum field width and/or precision."),
    'E1303': ("Expected mapping for format string, not %s",
              "format-needs-mapping",
              "Used when a format string that uses named conversion specifiers "
              "is used with an argument that is not a mapping."),
    'W1300': ("Format string dictionary key should be a string, not %s",
              "bad-format-string-key",
              "Used when a format string that uses named conversion specifiers "
              "is used with a dictionary whose keys are not all strings."),
    'W1301': ("Unused key %r in format string dictionary",
              "unused-format-string-key",
              "Used when a format string that uses named conversion specifiers "
              "is used with a dictionary that contains keys not required by the "
              "format string."),
    'E1304': ("Missing key %r in format string dictionary",
              "missing-format-string-key",
              "Used when a format string that uses named conversion specifiers "
              "is used with a dictionary that doesn't contain all the keys "
              "required by the format string."),
    'E1305': ("Too many arguments for format string",
              "too-many-format-args",
              "Used when a format string that uses unnamed conversion "
              "specifiers is given too many arguments."),
    'E1306': ("Not enough arguments for format string",
              "too-few-format-args",
              "Used when a format string that uses unnamed conversion "
              "specifiers is given too few arguments."),
    'E1310': ("Suspicious argument in %s.%s call",
              "bad-str-strip-call",
              "The argument to a str.{l,r,}strip call contains a"
              " duplicate character."),
    'W1302': ("Invalid format string",
              "bad-format-string",
              "Used when a PEP 3101 format string is invalid.",
              {'minversion': (2, 7)}),
    'W1303': ("Missing keyword argument %r for format string",
              "missing-format-argument-key",
              "Used when a PEP 3101 format string that uses named fields "
              "doesn't receive one or more required keywords.",
              {'minversion': (2, 7)}),
    'W1304': ("Unused format argument %r",
              "unused-format-string-argument",
              "Used when a PEP 3101 format string that uses named "
              "fields is used with an argument that "
              "is not required by the format string.",
              {'minversion': (2, 7)}),
    'W1305': ("Format string contains both automatic field numbering "
              "and manual field specification",
              "format-combined-specification",
              "Used when a PEP 3101 format string contains both automatic "
              "field numbering (e.g. '{}') and manual field "
              "specification (e.g. '{0}').",
              {'minversion': (2, 7)}),
    'W1306': ("Missing format attribute %r in format specifier %r",
              "missing-format-attribute",
              "Used when a PEP 3101 format string uses an "
              "attribute specifier ({0.length}), but the argument "
              "passed for formatting doesn't have that attribute.",
              {'minversion': (2, 7)}),
    'W1307': ("Using invalid lookup key %r in format specifier %r",
              "invalid-format-index",
              "Used when a PEP 3101 format string uses a lookup specifier "
              "({a[1]}), but the argument passed for formatting "
              "doesn't contain or doesn't have that key as an attribute.",
              {'minversion': (2, 7)})
    }
118

    
119
# Node types that the '%' formatting checks treat as a single argument that
# is neither a tuple nor a dict literal.
OTHER_NODES = (
    astroid.Const,
    astroid.List,
    astroid.Repr,
    astroid.Lambda,
    astroid.FunctionDef,
    astroid.ListComp,
    astroid.SetComp,
    astroid.GeneratorExp,
)
122

    
123
if _PY3K:
    import _string # pylint: disable=wrong-import-position, wrong-import-order

    def split_format_field_names(format_string):
        """Split a replacement-field name into its first key and the rest.

        Delegates to ``_string.formatter_field_name_split`` and returns
        its result unchanged.
        """
        return _string.formatter_field_name_split(format_string)
else:
    def _field_iterator_convertor(iterator):
        """Re-yield ``(is_attr, key)`` pairs, casting numeric keys to int."""
        for is_attr, key in iterator:
            yield is_attr, (int(key) if isinstance(key, numbers.Number) else key)

    def split_format_field_names(format_string):
        """Split a replacement-field name into its first key and the rest.

        Uses the string's own ``_formatter_field_name_split`` method and
        wraps the returned iterator so numeric keys come out as ints.
        """
        keyname, fielditerator = format_string._formatter_field_name_split()
        # it will return longs, instead of ints, which will complicate
        # the output
        return keyname, _field_iterator_convertor(fielditerator)
141

    
142

    
143
def collect_string_fields(format_string):
    """Yield the name of every replacement field in a PEP 3101 format string.

    Fields nested inside a format spec (``"{a:{b}}"``) are yielded as well,
    right after the field that contains them. Automatic fields (``"{}"``)
    are yielded as empty strings.

    Raises ``utils.IncompleteFormatString`` when the string cannot be parsed.
    """
    formatter = string.Formatter()
    try:
        for parsed in formatter.parse(format_string):
            name, nested = parsed[1], parsed[2]
            if not any(part is not None for part in parsed[1:]):
                # Pure literal text, not a replacement field.
                continue
            yield name
            if nested:
                for field in collect_string_fields(nested):
                    yield field
    except ValueError as exc:
        # Probably the format string is invalid.
        if not exc.args[0].startswith("cannot switch from manual"):
            raise utils.IncompleteFormatString(format_string)
        # On Jython, parsing a string with both manual
        # and automatic positions will fail with a ValueError,
        # while on CPython it will simply return the fields,
        # the validation being done in the interpreter (?).
        # We're just returning two mixed fields in order
        # to trigger the format-combined-specification check.
        yield ""
        yield "1"
175

    
176
def parse_format_method_string(format_string):
    """
    Parses a PEP 3101 format string, returning a tuple of
    (keys, num_args, manual_pos_arg),
    where keys is a list of (key, specifiers) pairs, one per field that is
    not a bare position, num_args is the number of automatic ("{}") fields
    and manual_pos_arg is the number of distinct explicit positions used.
    """
    keys = []
    num_args = 0
    manual_positions = set()
    for name in collect_string_fields(format_string):
        if not name:
            # Automatic numbering: "{}".
            num_args += 1
            continue
        text = str(name)
        if text.isdigit():
            # Bare explicit position: "{0}".
            manual_positions.add(text)
            continue
        keyname, fielditerator = split_format_field_names(name)
        if isinstance(keyname, numbers.Number):
            # In Python 2 it will return long which will lead
            # to different output between 2 and 3
            manual_positions.add(str(keyname))
            keyname = int(keyname)
        keys.append((keyname, list(fielditerator)))
    return keys, num_args, len(manual_positions)
201

    
202
def get_args(callfunc):
    """Get the arguments from the given call node.

    Return a tuple, where the first element is the
    number of positional arguments and the second element
    is a dict mapping each keyword name to the result of
    ``utils.safe_infer`` on its value.
    """
    named = {}
    if callfunc.keywords:
        for keyword in callfunc.keywords:
            named[keyword.arg] = utils.safe_infer(keyword.value)
    return len(callfunc.args), named
216

    
217
def get_access_path(key, parts):
    """ Given a list of format specifiers, returns
    the final access path (e.g. a.b.c[0][1]).

    ``parts`` is an iterable of ``(is_attribute, specifier)`` pairs:
    attribute specifiers are rendered as ``.name`` and the others as
    ``[repr(specifier)]``.
    """
    rendered = [
        ".{}".format(specifier) if is_attribute else "[{!r}]".format(specifier)
        for is_attribute, specifier in parts
    ]
    return str(key) + "".join(rendered)
228

    
229

    
230
class StringFormatChecker(BaseChecker):
    """Checks string formatting operations to ensure that the format string
    is valid and the arguments match the format string.

    ``visit_binop`` handles ``%`` formatting; ``visit_call`` handles
    ``str.format`` calls and suspicious ``strip`` arguments.
    """

    __implements__ = (IAstroidChecker,)
    name = 'string'
    msgs = MSGS

    @check_messages(*(MSGS.keys()))
    def visit_binop(self, node):
        """Check a ``%`` operation whose left operand is a string literal."""
        if node.op != '%':
            return
        left = node.left
        args = node.right

        if not (isinstance(left, astroid.Const)
                and isinstance(left.value, six.string_types)):
            return
        format_string = left.value
        try:
            required_keys, required_num_args = \
                utils.parse_format_string(format_string)
        except utils.UnsupportedFormatCharacter as e:
            c = format_string[e.index]
            self.add_message('bad-format-character',
                             node=node, args=(c, ord(c), e.index))
            return
        except utils.IncompleteFormatString:
            self.add_message('truncated-format-string', node=node)
            return
        if required_keys and required_num_args:
            # The format string uses both named and unnamed format
            # specifiers.
            self.add_message('mixed-format-string', node=node)
        elif required_keys:
            # The format string uses only named format specifiers.
            # Check that the RHS of the % operator is a mapping object
            # that contains precisely the set of keys required by the
            # format string.
            if isinstance(args, astroid.Dict):
                keys = set()
                unknown_keys = False
                for k, _ in args.items:
                    if isinstance(k, astroid.Const):
                        key = k.value
                        if isinstance(key, six.string_types):
                            keys.add(key)
                        else:
                            self.add_message('bad-format-string-key',
                                             node=node, args=key)
                    else:
                        # One of the keys was something other than a
                        # constant.  Since we can't tell what it is,
                        # suppress checks for missing keys in the
                        # dictionary.
                        unknown_keys = True
                if not unknown_keys:
                    for key in required_keys:
                        if key not in keys:
                            self.add_message('missing-format-string-key',
                                             node=node, args=key)
                for key in keys:
                    if key not in required_keys:
                        self.add_message('unused-format-string-key',
                                         node=node, args=key)
            elif isinstance(args, OTHER_NODES + (astroid.Tuple,)):
                type_name = type(args).__name__
                self.add_message('format-needs-mapping',
                                 node=node, args=type_name)
            # else:
                # The RHS of the format specifier is a name or
                # expression.  It may be a mapping object, so
                # there's nothing we can check.
        else:
            # The format string uses only unnamed format specifiers.
            # Check that the number of arguments passed to the RHS of
            # the % operator matches the number required by the format
            # string.
            if isinstance(args, astroid.Tuple):
                num_args = len(args.elts)
            elif isinstance(args, OTHER_NODES + (astroid.Dict, astroid.DictComp)):
                num_args = 1
            else:
                # The RHS of the format specifier is a name or
                # expression.  It could be a tuple of unknown size, so
                # there's nothing we can check.
                num_args = None
            if num_args is not None:
                if num_args > required_num_args:
                    self.add_message('too-many-format-args', node=node)
                elif num_args < required_num_args:
                    self.add_message('too-few-format-args', node=node)

    @check_messages(*(MSGS.keys()))
    def visit_call(self, node):
        """Check ``strip``-family and ``format`` calls on string instances."""
        func = utils.safe_infer(node.func)
        if (isinstance(func, astroid.BoundMethod)
                and isinstance(func.bound, astroid.Instance)
                and func.bound.name in ('str', 'unicode', 'bytes')):
            if func.name in ('strip', 'lstrip', 'rstrip') and node.args:
                arg = utils.safe_infer(node.args[0])
                if not isinstance(arg, astroid.Const):
                    return
                # A constant argument is not necessarily a string (e.g.
                # ``s.strip(None)``); len() on such a value would raise
                # TypeError, so only text and bytes values are examined.
                if not isinstance(arg.value, (six.text_type, six.binary_type)):
                    return
                if len(arg.value) != len(set(arg.value)):
                    self.add_message('bad-str-strip-call', node=node,
                                     args=(func.bound.name, func.name))
            elif func.name == 'format':
                if _PY27 or _PY3K:
                    self._check_new_format(node, func)

    def _check_new_format(self, node, func):
        """ Check the new string formatting. """
        # TODO: skip (for now) format nodes which don't have
        #       an explicit string on the left side of the format operation.
        #       We do this because our inference engine can't properly handle
        #       redefinitions of the original string.
        #       For more details, see issue 287.
        #
        # Note that there may not be any left side at all, if the format method
        # has been assigned to another variable. See issue 351. For example:
        #
        #    fmt = 'some string {}'.format
        #    fmt('arg')
        if (isinstance(node.func, astroid.Attribute)
                and not isinstance(node.func.expr, astroid.Const)):
            return
        try:
            strnode = next(func.bound.infer())
        except astroid.InferenceError:
            return
        if not isinstance(strnode, astroid.Const):
            return
        if not isinstance(strnode.value, six.string_types):
            return

        if node.starargs or node.kwargs:
            # The real arguments are unknown with * / ** unpacking.
            return
        try:
            positional, named = get_args(node)
        except astroid.InferenceError:
            return
        try:
            fields, num_args, manual_pos = parse_format_method_string(strnode.value)
        except utils.IncompleteFormatString:
            self.add_message('bad-format-string', node=node)
            return

        named_fields = {field[0] for field in fields
                        if isinstance(field[0], six.string_types)}
        if num_args and manual_pos:
            self.add_message('format-combined-specification',
                             node=node)
            return

        check_args = False
        # Consider "{[0]} {[1]}" as num_args. named_fields is a set, so an
        # empty field name contributes at most one argument here.
        has_empty_field = '' in named_fields
        if has_empty_field:
            num_args += 1
        if named_fields:
            for field in named_fields:
                if field not in named and field:
                    self.add_message('missing-format-argument-key',
                                     node=node,
                                     args=(field, ))
            for field in named:
                if field not in named_fields:
                    self.add_message('unused-format-string-argument',
                                     node=node,
                                     args=(field, ))
            # num_args can be 0 if manual_pos is not.
            num_args = num_args or manual_pos
            if positional or num_args:
                if named or has_empty_field:
                    # Verify the required number of positional arguments
                    # only if the .format got at least one keyword argument.
                    # This means that the format strings accepts both
                    # positional and named fields and we should warn
                    # when one of the them is missing or is extra.
                    check_args = True
        else:
            check_args = True
        if check_args:
            # num_args can be 0 if manual_pos is not.
            num_args = num_args or manual_pos
            if positional > num_args:
                self.add_message('too-many-format-args', node=node)
            elif positional < num_args:
                self.add_message('too-few-format-args', node=node)

        self._check_new_format_specifiers(node, fields, named)

    def _check_new_format_specifiers(self, node, fields, named):
        """
        Check attribute and index access in the format
        string ("{0.a}" and "{0[a]}").
        """
        for key, specifiers in fields:
            # Obtain the argument. If it can't be obtained
            # or inferred, skip this check.
            if key == '':
                # {[0]} will have an unnamed argument, defaulting
                # to 0. It will not be present in `named`, so use the value
                # 0 for it.
                key = 0
            if isinstance(key, numbers.Number):
                try:
                    argname = utils.get_argument_from_call(node, key)
                except utils.NoSuchArgumentError:
                    continue
            else:
                if key not in named:
                    continue
                argname = named[key]
            if argname in (astroid.YES, None):
                continue
            try:
                argument = next(argname.infer())
            except astroid.InferenceError:
                continue
            if not specifiers or argument is astroid.YES:
                # No need to check this key if it doesn't
                # use attribute / item access
                continue
            if argument.parent and isinstance(argument.parent, astroid.Arguments):
                # Ignore any object coming from an argument,
                # because we can't infer its value properly.
                continue
            previous = argument
            parsed = []
            for is_attribute, specifier in specifiers:
                if previous is astroid.YES:
                    break
                parsed.append((is_attribute, specifier))
                if is_attribute:
                    try:
                        previous = previous.getattr(specifier)[0]
                    except astroid.NotFoundError:
                        if (hasattr(previous, 'has_dynamic_getattr') and
                                previous.has_dynamic_getattr()):
                            # Don't warn if the object has a custom __getattr__
                            break
                        path = get_access_path(key, parsed)
                        self.add_message('missing-format-attribute',
                                         args=(specifier, path),
                                         node=node)
                        break
                else:
                    warn_error = False
                    if hasattr(previous, 'getitem'):
                        try:
                            previous = previous.getitem(specifier)
                        except (IndexError, TypeError):
                            warn_error = True
                        except astroid.InferenceError:
                            break
                    else:
                        try:
                            # Lookup __getitem__ in the current node,
                            # but skip further checks, because we can't
                            # retrieve the looked object
                            previous.getattr('__getitem__')
                            break
                        except astroid.NotFoundError:
                            warn_error = True
                    if warn_error:
                        path = get_access_path(key, parsed)
                        self.add_message('invalid-format-index',
                                         args=(specifier, path),
                                         node=node)
                        break

                try:
                    previous = next(previous.infer())
                except astroid.InferenceError:
                    # can't check further if we can't infer it
                    break
510

    
511

    
512

    
513
class StringConstantChecker(BaseTokenChecker):
    """Check string literals"""
    __implements__ = (ITokenChecker, IRawChecker)
    name = 'string_constant'
    msgs = {
        'W1401': ('Anomalous backslash in string: \'%s\'. '
                  'String constant might be missing an r prefix.',
                  'anomalous-backslash-in-string',
                  'Used when a backslash is in a literal string but not as an '
                  'escape.'),
        'W1402': ('Anomalous Unicode escape in byte string: \'%s\'. '
                  'String constant might be missing an r or u prefix.',
                  'anomalous-unicode-escape-in-string',
                  'Used when an escape like \\u is encountered in a byte '
                  'string where it has no effect.'),
        }

    # Characters that have a special meaning after a backslash in either
    # Unicode or byte strings.
    ESCAPE_CHARACTERS = 'abfnrtvx\n\r\t\\\'\"01234567'

    # TODO(mbp): Octal characters are quite an edge case today; people may
    # prefer a separate warning where they occur.  \0 should be allowed.

    # Characters that have a special meaning after a backslash but only in
    # Unicode strings.
    UNICODE_ESCAPE_CHARACTERS = 'uUN'

    # Default used until process_module() records whether the module enables
    # ``from __future__ import unicode_literals``.
    _unicode_literals = False

    def process_module(self, module):
        """Remember whether the module imports ``unicode_literals``."""
        self._unicode_literals = 'unicode_literals' in module.future_imports

    def process_tokens(self, tokens):
        """Run the string-literal checks on every STRING token."""
        for (tok_type, token, (start_row, _), _, _) in tokens:
            if tok_type != tokenize.STRING:
                continue
            # 'token' is the whole un-parsed token; we can look at the start
            # of it to see whether it's a raw or unicode string etc.
            self.process_string_token(token, start_row)

    def process_string_token(self, token, start_row):
        """Split a string token into prefix and body, then check the body.

        token: the un-parsed source text of the string literal.
        start_row: integer line number in the source.
        """
        for i, c in enumerate(token):
            if c in '\'\"':
                quote_char = c
                break
        # pylint: disable=undefined-loop-variable
        prefix = token[:i].lower() #  markers like u, b, r.
        after_prefix = token[i:]
        triple_quote = 3 * quote_char
        if after_prefix[:3] == triple_quote and after_prefix[-3:] == triple_quote:
            string_body = after_prefix[3:-3]
        else:
            string_body = after_prefix[1:-1]  # Chop off quotes
        # No special checks on raw strings at the moment.
        if 'r' in prefix:
            return
        self.process_non_raw_string_token(prefix, string_body, start_row)

    def process_non_raw_string_token(self, prefix, string_body, start_row):
        """check for bad escapes in a non-raw string.

        prefix: lowercase string of eg 'ur' string prefix markers.
        string_body: the un-parsed body of the string, not including the quote
        marks.
        start_row: integer line number in the source.
        """
        # Walk through the string; if we see a backslash then escape the next
        # character, and skip over it.  If we see a non-escaped character,
        # alert, and continue.
        #
        # Accept a backslash when it escapes a backslash, or a quote, or
        # end-of-line, or one of the letters that introduce a special escape
        # sequence <http://docs.python.org/reference/lexical_analysis.html>
        #
        # TODO(mbp): Maybe give a separate warning about the rarely-used
        # \a \b \v \f?
        #
        # TODO(mbp): We could give the column of the problem character, but
        # add_message doesn't seem to have a way to pass it through at present.
        i = string_body.find('\\')
        while i != -1:
            # There must be a next character; having a backslash at the end
            # of the string would be a SyntaxError.
            next_char = string_body[i+1]
            match = string_body[i:i+2]
            if next_char in self.UNICODE_ESCAPE_CHARACTERS:
                # The escape is meaningful only in a unicode string: either an
                # explicit u prefix, or unicode by default without a b prefix.
                is_unicode = 'u' in prefix or (
                    (_PY3K or self._unicode_literals) and 'b' not in prefix)
                if not is_unicode:
                    self.add_message('anomalous-unicode-escape-in-string',
                                     line=start_row, args=(match, ))
            elif next_char not in self.ESCAPE_CHARACTERS:
                self.add_message('anomalous-backslash-in-string',
                                 line=start_row, args=(match, ))
            # Whether it was a valid escape or not, backslash followed by
            # another character can always be consumed whole: the second
            # character can never be the start of a new backslash escape.
            i = string_body.find('\\', i + 2)
612

    
613

    
614

    
615
def register(linter):
    """required method to auto register this checker """
    for checker_class in (StringFormatChecker, StringConstantChecker):
        linter.register_checker(checker_class(linter))