Package lxml :: Module cssselect
[hide private]
[frames | no frames]

Source Code for Module lxml.cssselect

  1  """CSS Selectors based on XPath. 
  2   
  3  This module supports selecting XML/HTML tags based on CSS selectors. 
  4  See the `CSSSelector` class for details. 
  5  """ 
  6   
  7  import re 
  8  from lxml import etree 
  9   
 10  __all__ = ['SelectorSyntaxError', 'ExpressionError', 
 11             'CSSSelector'] 
 12   
 13  try: 
 14      _basestring = basestring 
 15  except NameError: 
 16      _basestring = str 
 17   
class SelectorSyntaxError(SyntaxError):
    """Raised by the tokenizer and parser for a malformed CSS selector."""
20
class ExpressionError(RuntimeError):
    """Raised when a parsed selector cannot be translated to XPath.

    Used for unsupported or unknown pseudo-classes and for unknown
    combinators.
    """
23
class CSSSelector(etree.XPath):
    """A CSS selector.

    Usage::

        >>> from lxml import etree, cssselect
        >>> select = cssselect.CSSSelector("a tag > child")

        >>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
        >>> [ el.tag for el in select(root) ]
        ['child']
    """

    def __init__(self, css):
        # Translate the CSS text to XPath and hand that to the XPath base
        # class; the original CSS text is kept for __repr__.
        etree.XPath.__init__(self, css_to_xpath(css))
        self.css = css

    def __repr__(self):
        # Object identity rendered as bare hex digits (no '0x' prefix).
        ident = hex(abs(id(self)))[2:]
        return '<%s %s for %r>' % (self.__class__.__name__, ident, self.css)

##############################
## Token objects:

# Base text type for the token classes below.
try:
    unicode
except NameError:
    # Python 3
    _unicode = str
else:
    _unicode = unicode

class _UniToken(_unicode):
    """Text value that also carries the ``pos`` it was created with."""

    def __new__(cls, contents, pos):
        token = _unicode.__new__(cls, contents)
        token.pos = pos
        return token

    def __repr__(self):
        text = _unicode.__repr__(self)
        return '%s(%s, %r)' % (self.__class__.__name__, text, self.pos)
67
class Symbol(_UniToken):
    """Token yielded by ``tokenize`` for names and ``an+b`` style counts."""
70
class String(_UniToken):
    """Token yielded by ``tokenize`` for the contents of a quoted string."""
73
class Token(_UniToken):
    """Token yielded by ``tokenize`` for punctuation and operators."""

############################################################
## Parsing
############################################################

##############################
## Syntax objects:

class Class(object):
    """
    Represents selector.class_name
    """

    def __init__(self, selector, class_name):
        self.selector, self.class_name = selector, class_name

    def __repr__(self):
        return '%s[%r.%s]' % (
            self.__class__.__name__, self.selector, self.class_name)

    def xpath(self):
        """Return the inner selector's XPath with a class-membership test added."""
        result = self.selector.xpath()
        # Pad both the attribute and the wanted name with spaces so only a
        # whole, whitespace-delimited class name matches.
        wanted = xpath_repr(' ' + self.class_name + ' ')
        result.add_condition(
            "contains(concat(' ', normalize-space(@class), ' '), %s)" % wanted)
        return result
104
class Function(object):
    """
    Represents selector:name(expr)

    ``xpath()`` dispatches to the ``_xpath_<name>`` method (dashes in the
    name become underscores), so every method with that prefix is a
    supported functional pseudo-class.
    """

    # Functional pseudo-classes that are recognised but deliberately rejected.
    unsupported = [
        'target', 'lang', 'enabled', 'disabled',]

    def __init__(self, selector, type, name, expr):
        self.selector = selector
        self.type = type
        self.name = name
        self.expr = expr

    def __repr__(self):
        return '%s[%r%s%s(%r)]' % (
            self.__class__.__name__,
            self.selector,
            self.type, self.name, self.expr)

    def xpath(self):
        """Translate to an XPath expression object.

        Raises ExpressionError if the pseudo-class is listed in
        ``unsupported`` or has no ``_xpath_*`` handler.
        """
        sel_path = self.selector.xpath()
        if self.name in self.unsupported:
            raise ExpressionError(
                "The psuedo-class %r is not supported" % self.name)
        method = '_xpath_' + self.name.replace('-', '_')
        if not hasattr(self, method):
            raise ExpressionError(
                "The psuedo-class %r is unknown" % self.name)
        method = getattr(self, method)
        return method(sel_path, self.expr)

    def _xpath_nth_child(self, xpath, expr, last=False,
                         add_name_test=True):
        """Add the positional condition for an ``an+b`` argument to ``xpath``.

        ``last`` selects the ``nth-last-*`` variants; ``add_name_test``
        moves the element name into a ``name()`` condition first (see
        ``XPathExpr.add_name_test``).  Returns the same ``xpath`` object.
        """
        a, b = parse_series(expr)
        if not a and not b and not last:
            # a=0 means nothing is returned...
            xpath.add_condition('false() and position() = 0')
            return xpath
        if add_name_test:
            xpath.add_name_test()
        xpath.add_star_prefix()
        if a == 0:
            # Plain index: a single fixed position.
            if last:
                b = 'last() - %s' % b
            xpath.add_condition('position() = %s' % b)
            return xpath
        if last:
            # FIXME: I'm not sure if this is right
            a = -a
            b = -b
        # b_neg is the text spliced after ``position()``: "-b" or "+|b|".
        if b > 0:
            b_neg = str(-b)
        else:
            b_neg = '+%s' % (-b)
        if a != 1:
            expr = ['(position() %s) mod %s = 0' % (b_neg, a)]
        else:
            expr = []
        if b >= 0:
            expr.append('position() >= %s' % b)
        elif b < 0 and last:
            expr.append('position() < (last() %s)' % b)
        expr = ' and '.join(expr)
        if expr:
            xpath.add_condition(expr)
        return xpath
        # FIXME: handle an+b, odd, even
        # an+b means every-a, plus b, e.g., 2n+1 means odd
        # 0n+b means b
        # n+0 means a=1, i.e., all elements
        # an means every a elements, i.e., 2n means even
        # -n means -1n
        # -1n+6 means elements 6 and previous

    def _xpath_nth_last_child(self, xpath, expr):
        return self._xpath_nth_child(xpath, expr, last=True)

    def _xpath_nth_of_type(self, xpath, expr):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:nth-of-type() is not implemented")
        return self._xpath_nth_child(xpath, expr, add_name_test=False)

    def _xpath_nth_last_of_type(self, xpath, expr):
        # NOTE(review): unlike _xpath_nth_of_type there is no '*' guard
        # here -- confirm whether that is intentional.
        return self._xpath_nth_child(xpath, expr, last=True, add_name_test=False)

    def _xpath_contains(self, xpath, expr):
        # text content, minus tags, must contain expr
        if isinstance(expr, Element):
            expr = expr._format_element()
        xpath.add_condition('contains(css:lower-case(string(.)), %s)'
                            % xpath_repr(expr.lower()))
        # FIXME: Currently case insensitive matching doesn't seem to be happening
        return xpath

    def _xpath_not(self, xpath, expr):
        # everything for which not expr applies
        expr = expr.xpath()
        cond = expr.condition
        # FIXME: should I do something about element_path?
        xpath.add_condition('not(%s)' % cond)
        return xpath
208
def _make_lower_case(context, s):
    """Return ``s`` lower-cased; ``context`` is accepted but not used."""
    lowered = s.lower()
    return lowered

# Register _make_lower_case as the XPath extension function
# ``css:lower-case`` (referenced by Function._xpath_contains).
ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/')
ns.prefix = 'css'
ns['lower-case'] = _make_lower_case

class Pseudo(object):
    """
    Represents selector:ident

    ``xpath()`` dispatches to the ``_xpath_<ident>`` method (dashes become
    underscores), so every method with that prefix is a supported
    pseudo-class.
    """

    unsupported = ['indeterminate', 'first-line', 'first-letter',
                   'selection', 'before', 'after', 'link', 'visited',
                   'active', 'focus', 'hover']

    def __init__(self, element, type, ident):
        self.element = element
        assert type in (':', '::')
        self.type = type
        self.ident = ident

    def __repr__(self):
        return '%s[%r%s%s]' % (
            self.__class__.__name__, self.element, self.type, self.ident)

    def xpath(self):
        """Translate to an XPath expression object.

        Raises ExpressionError for unsupported or unknown pseudo-classes.
        """
        el_xpath = self.element.xpath()
        if self.ident in self.unsupported:
            raise ExpressionError(
                "The psuedo-class %r is unsupported" % self.ident)
        handler = getattr(self, '_xpath_' + self.ident.replace('-', '_'), None)
        if handler is None:
            raise ExpressionError(
                "The psuedo-class %r is unknown" % self.ident)
        return handler(el_xpath)

    def _xpath_checked(self, xpath):
        # FIXME: is this really all the elements?
        xpath.add_condition(
            "(@selected or @checked) and (name(.) = 'input' or name(.) = 'option')")
        return xpath

    def _xpath_root(self, xpath):
        # if this element is the root element
        raise NotImplementedError

    def _xpath_first_child(self, xpath):
        xpath.add_star_prefix()
        xpath.add_name_test()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_last_child(self, xpath):
        xpath.add_star_prefix()
        xpath.add_name_test()
        xpath.add_condition('position() = last()')
        return xpath

    def _xpath_first_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:first-of-type is not implemented")
        xpath.add_star_prefix()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_last_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:last-of-type is not implemented")
        xpath.add_star_prefix()
        xpath.add_condition('position() = last()')
        return xpath

    def _xpath_only_child(self, xpath):
        xpath.add_name_test()
        xpath.add_star_prefix()
        xpath.add_condition('last() = 1')
        return xpath

    def _xpath_only_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:only-of-type is not implemented")
        xpath.add_condition('last() = 1')
        return xpath

    def _xpath_empty(self, xpath):
        # No child elements and no non-whitespace text.
        xpath.add_condition("not(*) and not(normalize-space())")
        return xpath
303
class Attrib(object):
    """
    Represents selector[namespace|attrib operator value]
    """

    def __init__(self, selector, namespace, attrib, operator, value):
        self.selector = selector
        self.namespace = namespace
        self.attrib = attrib
        self.operator = operator
        self.value = value

    def __repr__(self):
        cls_name = self.__class__.__name__
        if self.operator == 'exists':
            return '%s[%r[%s]]' % (
                cls_name, self.selector, self._format_attrib())
        return '%s[%r[%s %s %r]]' % (
            cls_name, self.selector, self._format_attrib(),
            self.operator, self.value)

    def _format_attrib(self):
        """Return the attribute in CSS notation (``ns|name`` or ``name``)."""
        if self.namespace != '*':
            return '%s|%s' % (self.namespace, self.attrib)
        return self.attrib

    def _xpath_attrib(self):
        """Return the attribute in XPath notation (``@ns:name`` or ``@name``)."""
        # FIXME: if attrib is *?
        if self.namespace != '*':
            return '@%s:%s' % (self.namespace, self.attrib)
        return '@' + self.attrib

    def xpath(self):
        """Return the inner selector's XPath with the attribute test added."""
        path = self.selector.xpath()
        attrib = self._xpath_attrib()
        value = self.value
        op = self.operator
        # Stays None only for an unknown operator with assertions disabled,
        # in which case the path is returned without an extra condition.
        cond = None
        if op == 'exists':
            assert not value
            cond = attrib
        elif op == '=':
            cond = '%s = %s' % (attrib, xpath_repr(value))
        elif op == '!=':
            # FIXME: this seems like a weird hack...
            if value:
                cond = 'not(%s) or %s != %s' % (
                    attrib, attrib, xpath_repr(value))
            else:
                cond = '%s != %s' % (attrib, xpath_repr(value))
        elif op == '~=':
            cond = "contains(concat(' ', normalize-space(%s), ' '), %s)" % (
                attrib, xpath_repr(' '+value+' '))
        elif op == '|=':
            # Weird, but true...
            cond = '%s = %s or starts-with(%s, %s)' % (
                attrib, xpath_repr(value),
                attrib, xpath_repr(value + '-'))
        elif op == '^=':
            cond = 'starts-with(%s, %s)' % (attrib, xpath_repr(value))
        elif op == '$=':
            # Oddly there is a starts-with in XPath 1.0, but not ends-with
            cond = 'substring(%s, string-length(%s)-%s) = %s' % (
                attrib, attrib, len(value)-1, xpath_repr(value))
        elif op == '*=':
            # FIXME: case sensitive?
            cond = 'contains(%s, %s)' % (attrib, xpath_repr(value))
        else:
            assert 0, ("Unknown operator: %r" % self.operator)
        if cond is not None:
            path.add_condition(cond)
        return path
383
class Element(object):
    """
    Represents namespace|element
    """

    def __init__(self, namespace, element):
        self.namespace, self.element = namespace, element

    def __repr__(self):
        return '%s[%s]' % (self.__class__.__name__, self._format_element())

    def _format_element(self):
        """Return the element in CSS notation (``ns|name`` or ``name``)."""
        if self.namespace != '*':
            return '%s|%s' % (self.namespace, self.element)
        return self.element

    def xpath(self):
        """Return a fresh XPathExpr selecting this element name."""
        if self.namespace != '*':
            # FIXME: Should we lowercase here?
            name = '%s:%s' % (self.namespace, self.element)
        else:
            name = self.element.lower()
        return XPathExpr(element=name)
411
class Hash(object):
    """
    Represents selector#id
    """

    def __init__(self, selector, id):
        self.selector, self.id = selector, id

    def __repr__(self):
        return '%s[%r#%s]' % (
            self.__class__.__name__, self.selector, self.id)

    def xpath(self):
        """Return the inner selector's XPath with an ``@id`` equality test added."""
        result = self.selector.xpath()
        id_literal = xpath_repr(self.id)
        result.add_condition('@id = %s' % id_literal)
        return result
430
class Or(object):
    """Represents a comma-separated group of selectors."""

    def __init__(self, items):
        self.items = items

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self.items)

    def xpath(self):
        """Translate every member and wrap the results in an XPathExprOr."""
        translated = []
        for member in self.items:
            translated.append(member.xpath())
        return XPathExprOr(translated)
443
class CombinedSelector(object):
    """Represents two selectors joined by a combinator (' ', '>', '+', '~')."""

    # Combinator token -> suffix of the ``_xpath_*`` method implementing it.
    _method_mapping = {
        ' ': 'descendant',
        '>': 'child',
        '+': 'direct_adjacent',
        '~': 'indirect_adjacent',
        }

    def __init__(self, selector, combinator, subselector):
        assert selector is not None
        self.selector = selector
        self.combinator = combinator
        self.subselector = subselector

    def __repr__(self):
        # A bare space would be invisible in the output, so spell it out.
        comb = self.combinator
        if comb == ' ':
            comb = '<followed>'
        return '%s[%r %s %r]' % (
            self.__class__.__name__, self.selector, comb, self.subselector)

    def xpath(self):
        """Translate to XPath; raises ExpressionError for an unknown combinator."""
        if self.combinator not in self._method_mapping:
            raise ExpressionError(
                "Unknown combinator: %r" % self.combinator)
        handler = getattr(
            self, '_xpath_' + self._method_mapping[self.combinator])
        left = self.selector.xpath()
        return handler(left, self.subselector)

    def _xpath_descendant(self, xpath, sub):
        # when sub is a descendant in any way of xpath
        right = sub.xpath()
        xpath.join('/descendant::', right)
        return xpath

    def _xpath_child(self, xpath, sub):
        # when sub is an immediate child of xpath
        right = sub.xpath()
        xpath.join('/', right)
        return xpath

    def _xpath_direct_adjacent(self, xpath, sub):
        # when sub immediately follows xpath
        right = sub.xpath()
        xpath.join('/following-sibling::', right)
        xpath.add_name_test()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_indirect_adjacent(self, xpath, sub):
        # when sub comes somewhere after xpath as a sibling
        right = sub.xpath()
        xpath.join('/following-sibling::', right)
        return xpath

##############################
## XPathExpr objects:

# Fast-path patterns used by css_to_xpath() to translate the simplest
# selectors without running the full parser:
# a bare element name, optionally followed by whitespace
_el_re = re.compile(r'^\w+\s*$')
# "#id" or "tag#id"
_id_re = re.compile(r'^(\w*)#(\w+)\s*$')
# ".class" or "tag.class"
_class_re = re.compile(r'^(\w*)\.(\w+)\s*$')

def css_to_xpath(css_expr, prefix='descendant-or-self::'):
    """Translate a CSS selector into an XPath expression string.

    ``css_expr`` is either selector text or an already parsed selector
    object (anything with an ``xpath()`` method).  ``prefix`` is prepended
    to the resulting expression; a falsy prefix (``''`` or ``None``) means
    "no prefix".

    Plain ``tag``, ``tag#id`` and ``tag.class`` selectors are translated
    directly by regular expression; everything else goes through
    ``parse()``.
    """
    if isinstance(css_expr, _basestring):
        # The fast paths build the result with string formatting, so a
        # None prefix must be normalised here; otherwise it would be
        # rendered as the text 'None'.  The parsed path below already
        # skips a falsy prefix.
        text_prefix = prefix or ''
        match = _el_re.search(css_expr)
        if match is not None:
            return '%s%s' % (text_prefix, match.group(0).strip())
        match = _id_re.search(css_expr)
        if match is not None:
            return "%s%s[@id = '%s']" % (
                text_prefix, match.group(1) or '*', match.group(2))
        match = _class_re.search(css_expr)
        if match is not None:
            return "%s%s[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % (
                text_prefix, match.group(1) or '*', match.group(2))
        css_expr = parse(css_expr)
    expr = css_expr.xpath()
    assert expr is not None, (
        "Got None for xpath expression from %s" % repr(css_expr))
    if prefix:
        expr.add_prefix(prefix)
    return str(expr)
528
class XPathExpr(object):
    """Mutable XPath step, rendered as prefix + path + element + [condition]."""

    def __init__(self, prefix=None, path=None, element='*', condition=None,
                 star_prefix=False):
        self.prefix = prefix
        self.path = path
        self.element = element
        self.condition = condition
        self.star_prefix = star_prefix

    def __str__(self):
        pieces = []
        if self.prefix is not None:
            pieces.append(str(self.prefix))
        if self.path is not None:
            pieces.append(str(self.path))
        pieces.append(str(self.element))
        if self.condition:
            pieces.append('[%s]' % self.condition)
        return ''.join(pieces)

    def __repr__(self):
        return '%s[%s]' % (self.__class__.__name__, self)

    def add_condition(self, condition):
        """AND ``condition`` onto the existing predicate (or set it)."""
        if not self.condition:
            self.condition = condition
        else:
            self.condition = '%s and (%s)' % (self.condition, condition)

    def add_path(self, part):
        """Move the current element into the path and make ``part`` the element."""
        if self.path is None:
            self.path = self.element
        else:
            self.path = self.path + self.element
        self.element = part

    def add_prefix(self, prefix):
        """Put ``prefix`` in front of any existing prefix."""
        self.prefix = prefix + self.prefix if self.prefix else prefix

    def add_name_test(self):
        """Replace a concrete element name by ``*`` plus a ``name()`` condition."""
        if self.element == '*':
            # We weren't doing a test anyway
            return
        self.add_condition("name() = %s" % xpath_repr(self.element))
        self.element = '*'

    def add_star_prefix(self):
        """
        Adds a /* prefix if there is no prefix.  This is when you need
        to keep context's constrained to a single parent.
        """
        self.path = (self.path or '') + '*/'
        self.star_prefix = True

    def join(self, combiner, other):
        """Make this expression ``<self><combiner><other>`` in place."""
        head = str(self) + combiner
        tail = (other.prefix or '') + (other.path or '')
        # We don't need a star prefix if we are joining to this other
        # prefix; so we'll get rid of it
        if other.star_prefix and tail == '*/':
            tail = ''
        self.prefix = head
        self.path = tail
        self.element = other.element
        self.condition = other.condition
603
class XPathExprOr(XPathExpr):
    """
    Represents |'d expressions.  Note that unfortunately it isn't
    the union, it's the sum, so duplicate elements will appear.
    """

    def __init__(self, items, prefix=None):
        for item in items:
            assert item is not None
        self.items = items
        self.prefix = prefix

    def __str__(self):
        # The shared prefix is repeated in front of every alternative.
        lead = self.prefix or ''
        rendered = []
        for item in self.items:
            rendered.append(lead + str(item))
        return ' | '.join(rendered)
619
def xpath_repr(s):
    """Return ``s`` as an XPath 1.0 string literal.

    ``s`` may be text or an ``Element`` (rendered via ``_format_element``).

    XPath 1.0 literals have no escape sequences: a literal is delimited by
    one kind of quote and may not contain that quote.  Python's ``repr()``
    was used here before, but it emits backslash escapes (for a string
    containing both quote kinds, for backslashes, newlines, ...) that XPath
    does not understand.  For text without such characters the result is
    identical to the old ``repr()`` based output.
    """
    if isinstance(s, Element):
        # This is probably a symbol that looks like an expression...
        s = s._format_element()
    s = str(s)
    if "'" not in s:
        return "'%s'" % s
    if '"' not in s:
        return '"%s"' % s
    # Both quote kinds are present: glue single-quoted chunks together
    # with double-quoted apostrophes using concat().
    pieces = []
    for index, chunk in enumerate(s.split("'")):
        if index:
            pieces.append('"\'"')
        if chunk:
            pieces.append("'%s'" % chunk)
    return 'concat(%s)' % ', '.join(pieces)

##############################
## Parsing functions

def parse(string):
    """Parse CSS selector text into a tree of selector objects.

    Returns whatever ``parse_selector_group`` builds from the token stream.
    A ``SelectorSyntaxError`` raised while parsing is re-raised with its
    ``args`` replaced by a message that also shows the tokens consumed so
    far and the tokens that were still pending.
    """
    # ``sys`` is not imported at module level; without this import the
    # error handler below would itself fail with a NameError.
    import sys
    stream = TokenStream(tokenize(string))
    stream.source = string
    try:
        return parse_selector_group(stream)
    except SelectorSyntaxError:
        # sys.exc_info() keeps this working on Python versions without
        # the ``except ... as`` syntax.
        e = sys.exc_info()[1]
        e.args = tuple(["%s at %s -> %s" % (
            e, stream.used, list(stream))])
        raise
642
def parse_selector_group(stream):
    """Parse one or more comma-separated selectors from ``stream``.

    Returns the single selector when there is only one, otherwise an
    ``Or`` holding all of them.
    """
    selectors = [parse_selector(stream)]
    while stream.peek() == ',':
        stream.next()
        selectors.append(parse_selector(stream))
    if len(selectors) == 1:
        return selectors[0]
    return Or(selectors)
655
def parse_selector(stream):
    """Parse a chain of simple selectors joined by combinators.

    Stops at a ``,`` or at the end of the stream and returns the resulting
    selector (a ``CombinedSelector`` when there was more than one part).
    """
    result = parse_simple_selector(stream)
    while True:
        upcoming = stream.peek()
        if upcoming is None or upcoming == ',':
            return result
        if upcoming in ('+', '>', '~'):
            # A combinator
            combinator = stream.next()
        else:
            # No explicit combinator: descendant relationship.
            combinator = ' '
        right = parse_simple_selector(stream)
        result = CombinedSelector(result, combinator, right)
670
def parse_simple_selector(stream):
    """Parse one simple selector from ``stream``.

    A simple selector is an optional ``namespace|element`` name followed by
    any number of ``#id``, ``.class``, ``[attrib]``, ``:pseudo`` and
    ``:function(arg)`` parts.  Returns the innermost ``Element`` wrapped in
    one object per part.

    Raises SelectorSyntaxError when a name, ``]`` or ``)`` is expected but
    something else is found.
    """
    peek = stream.peek()
    if peek != '*' and not isinstance(peek, Symbol):
        # No element name given: match any element in any namespace.
        element = namespace = '*'
    else:
        next = stream.next()
        if next != '*' and not isinstance(next, Symbol):
            raise SelectorSyntaxError(
                "Expected symbol, got %r" % next)
        if stream.peek() == '|':
            namespace = next
            stream.next()
            element = stream.next()
            # Validate the element name itself.  The namespace in ``next``
            # was already validated above, so checking it again here would
            # let a bad element name through.
            if element != '*' and not isinstance(element, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got %r" % element)
        else:
            namespace = '*'
            element = next
    result = Element(namespace, element)
    has_hash = False
    while 1:
        peek = stream.peek()
        if peek == '#':
            if has_hash:
                # You can't have two hashes
                # (FIXME: is there some more general rule I'm missing?)
                break
            stream.next()
            result = Hash(result, stream.next())
            has_hash = True
            continue
        elif peek == '.':
            stream.next()
            result = Class(result, stream.next())
            continue
        elif peek == '[':
            stream.next()
            result = parse_attrib(result, stream)
            next = stream.next()
            if not next == ']':
                raise SelectorSyntaxError(
                    "] expected, got %r" % next)
            continue
        elif peek == ':' or peek == '::':
            type = stream.next()
            ident = stream.next()
            if not isinstance(ident, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got %r" % ident)
            if stream.peek() == '(':
                stream.next()
                peek = stream.peek()
                if isinstance(peek, String):
                    selector = stream.next()
                elif isinstance(peek, Symbol) and is_int(peek):
                    selector = int(stream.next())
                else:
                    # FIXME: parse_simple_selector, or selector, or...?
                    selector = parse_simple_selector(stream)
                next = stream.next()
                if not next == ')':
                    raise SelectorSyntaxError(
                        "Expected ), got %r and %r"
                        % (next, selector))
                result = Function(result, type, ident, selector)
            else:
                result = Pseudo(result, type, ident)
            continue
        else:
            if peek == ' ':
                stream.next()
            break
        # FIXME: not sure what "negation" is
    return result
746
def is_int(v):
    """Return True if ``int(v)`` succeeds, False if it raises ValueError."""
    try:
        int(v)
        return True
    except ValueError:
        return False
754
def parse_attrib(selector, stream):
    """Parse the inside of ``[...]`` and return an ``Attrib`` around ``selector``.

    The closing ``]`` is left in the stream for the caller.  Raises
    SelectorSyntaxError for an unknown operator or a value that is neither
    a Symbol nor a String.
    """
    namespace = '*'
    attrib = stream.next()
    if stream.peek() == '|':
        # What was read so far is the namespace; the real name follows.
        namespace = attrib
        stream.next()
        attrib = stream.next()
    if stream.peek() == ']':
        return Attrib(selector, namespace, attrib, 'exists', None)
    op = stream.next()
    if op not in ('^=', '$=', '*=', '=', '~=', '|=', '!='):
        raise SelectorSyntaxError(
            "Operator expected, got %r" % op)
    value = stream.next()
    if isinstance(value, (Symbol, String)):
        return Attrib(selector, namespace, attrib, op, value)
    raise SelectorSyntaxError(
        "Expected string or symbol, got %r" % value)
774
def parse_series(s):
    """
    Parses things like '1n+2', or 'an+b' generally, returning (a, b)
    """
    def coefficient(text, default):
        # '' -> default, a lone sign -> +/-1, anything else -> int(text)
        if not text:
            return default
        if text in ('-', '+'):
            return int(text + '1')
        return int(text)

    if isinstance(s, Element):
        s = s._format_element()
    if not s or s == '*':
        # Happens when there's nothing, which the CSS parser thinks of as *
        return (0, 0)
    if isinstance(s, int):
        # Happens when you just get a number
        return (0, s)
    keywords = {'odd': (2, 1), 'even': (2, 0), 'n': (1, 0)}
    if s in keywords:
        return keywords[s]
    if 'n' not in s:
        # Just a b
        return (0, int(s))
    a_text, b_text = s.split('n', 1)
    return (coefficient(a_text, 1), coefficient(b_text, 0))


############################################################
## Tokenizing
############################################################

# A run of whitespace.
_whitespace_re = re.compile(r'\s+')

# A /* ... */ comment; re.S lets it span several lines.
_comment_re = re.compile(r'/\*.*?\*/', re.S)

# An "an+b" style count such as "2n+1", "-n" or "n-3".
_count_re = re.compile(r'[+-]?\d*n(?:[+-]\d+)?')

def tokenize(s):
    """Generate the tokens of the CSS selector text ``s``.

    Comments are removed first.  Yields ``Symbol`` for names and ``an+b``
    counts, ``Token`` for punctuation and operators, and ``String`` for
    quoted strings; each carries the offset it was found at.  Whitespace
    is skipped, except that a ``' '`` Token is emitted before a ``.`` or
    ``#`` that follows whitespace starting after offset 0.
    """
    two_char_tokens = ('~=', '|=', '^=', '$=', '*=', '::', '!=')
    s = _comment_re.sub('', s)
    pos = 0
    while True:
        blank = _whitespace_re.match(s, pos=pos)
        if blank:
            preceding_whitespace_pos = pos
            pos = blank.end()
        else:
            preceding_whitespace_pos = 0
        if pos >= len(s):
            return
        count = _count_re.match(s, pos=pos)
        if count and count.group() != 'n':
            # An an+b expression; a lone 'n' is treated as a plain symbol.
            end = count.end()
            yield Symbol(s[pos:end], pos)
            pos = end
        elif s[pos:pos+2] in two_char_tokens:
            yield Token(s[pos:pos+2], pos)
            pos += 2
        elif s[pos] in '>+~,.*=[]()|:#':
            char = s[pos]
            if char in '.#' and preceding_whitespace_pos > 0:
                yield Token(' ', preceding_whitespace_pos)
            yield Token(char, pos)
            pos += 1
        elif s[pos] in ('"', "'"):
            # Quoted string
            text, end = tokenize_escaped_string(s, pos)
            yield String(text, pos)
            pos = end
        else:
            text, end = tokenize_symbol(s, pos)
            yield Symbol(text, pos)
            pos = end
862
def tokenize_escaped_string(s, pos):
    """Read the quoted string that starts at ``s[pos]``.

    Returns ``(text, end)`` where ``text`` is the string contents with
    backslash escapes decoded and ``end`` is the offset just after the
    closing quote.  Raises SelectorSyntaxError if no closing quote is
    found.
    """
    quote = s[pos]
    assert quote in ('"', "'")
    start = pos = pos + 1
    while True:
        end = s.find(quote, pos)
        if end == -1:
            raise SelectorSyntaxError(
                "Expected closing %s for string in: %r"
                % (quote, s[start:]))
        raw = s[start:end]
        try:
            decoded = raw.encode('ASCII', 'backslashreplace').decode('unicode_escape')
        except UnicodeDecodeError:
            # Probably a hanging \, i.e. this quote was escaped: keep
            # looking for the next one.
            pos = end + 1
            continue
        return decoded, end + 1

# Any character that cannot be part of a symbol (word characters,
# backslash and '-' are allowed).
_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE)

def tokenize_symbol(s, pos):
    """Read the symbol that starts at ``s[pos]``.

    Returns ``(text, end)`` where ``end`` is the offset of the first
    character that is not part of the symbol.  When the symbol is delimited
    by such a character, backslash escapes in it are decoded; a symbol that
    runs to the end of ``s`` is returned as-is.

    Raises SelectorSyntaxError if ``s[pos]`` cannot start a symbol or if
    the escapes cannot be decoded.
    """
    # ``sys`` is not imported at module level; it is needed for exc_info().
    import sys
    start = pos
    match = _illegal_symbol.search(s, pos=pos)
    if not match:
        # Goes to end of s
        return s[start:], len(s)
    if match.start() == pos:
        # This is bad user input, not a programming error, so it must not be
        # an ``assert``: with assertions disabled an empty symbol would be
        # returned without advancing, and the tokenizer would never finish.
        raise SelectorSyntaxError(
            "Unexpected symbol: %r at %s" % (s[pos], pos))
    result = s[start:match.start()]
    pos = match.start()
    try:
        result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape')
    except UnicodeDecodeError:
        e = sys.exc_info()[1]
        raise SelectorSyntaxError(
            "Bad symbol %r: %s" % (result, e))
    return result, pos
907
class TokenStream(object):
    """Token iterator with one token of lookahead.

    ``used`` records every token handed out by ``next()``.  Both ``next()``
    and ``peek()`` return None once the underlying tokens are exhausted.
    """

    def __init__(self, tokens, source=None):
        self.used = []
        self.tokens = iter(tokens)
        self.source = source
        self.peeked = None
        self._peeking = False
        advance = getattr(self.tokens, 'next', None)
        if advance is None:
            # Python 3
            advance = self.tokens.__next__
        self.next_token = advance

    def next(self):
        """Consume and return the next token, or None at the end."""
        if self._peeking:
            # Hand out the token that peek() already fetched.
            self._peeking = False
            self.used.append(self.peeked)
            return self.peeked
        try:
            token = self.next_token()
        except StopIteration:
            return None
        self.used.append(token)
        return token

    def __iter__(self):
        return iter(self.next, None)

    def peek(self):
        """Return the next token without consuming it, or None at the end."""
        if self._peeking:
            return self.peeked
        try:
            self.peeked = self.next_token()
        except StopIteration:
            return None
        self._peeking = True
        return self.peeked
946