
Source Code for Module libxyz.parser.lexer

#-*- coding: utf8 -*-
#
# Max E. Kuznecov ~syhpoon <syhpoon@syhpoon.name> 2008
#
# This file is part of XYZCommander.
# XYZCommander is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# XYZCommander is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser Public License for more details.
# You should have received a copy of the GNU Lesser Public License
# along with XYZCommander. If not, see <http://www.gnu.org/licenses/>.

from libxyz.parser import SourceData
from libxyz.exceptions import LexerError

class Lexer(object):
21 """ 22 Lexical analyzer 23 24 Lexer rules: 25 ----------- 26 * Blank chars are usually ignored. Except from in quotes. 27 * Quote can be one-line: "quoted value", or multiline: 28 '''quoted value1, 29 quoted value2, 30 ''' 31 * New-line char ends commented line if any. 32 * Values can be provided as simple literals or quoted ones. 33 * If value contains spaces or any other non-alphanumeric values it is better 34 to quote it or escape it using escapechar. 35 * Variable can take list of values, separated by comma 36 * Escaping can only be used in rval position. 37 38 Macros: 39 ------ 40 Macros are special internal variables that get expanded upon parsing. 41 Macro definition is similar to variable definition, but macro char 42 (default '&') is prepended to var name: 43 &macro = value 44 var = &macro 45 """ 46 47 TOKEN_IDT = 0 48 TOKEN_MACRO = 1 49
    def __init__(self, source, tokens, comment=u"#", macro=u"&"):
51 """ 52 @param source: Parsing source. If file object is passed, it must be 53 closed by caller function after parsing completes. 54 @type source: string, file-like object or SourceData object 55 56 @param tokens: List of tokens 57 @type tokens: sequence 58 59 @param comment: Comment char 60 @param macro: Macros char 61 """ 62 63 if isinstance(source, SourceData): 64 self.sdata = source 65 else: 66 self.sdata = SourceData(source) 67 68 self.tokens = tokens 69 self.comment = comment 70 self.macro = macro 71 72 self._escapechar = u"\\" 73 self._xqchar = u"'" 74 self._xqcount = 3 75 self._xqtotal = 0 76 self._skip_next = 0 77 78 # Should be set to True when done parsing 79 self._done = False 80 # Should be set to True when parsing id can use escaped characters 81 self._can_escape = False 82 self._escaped = False 83 self._in_quote = False 84 self._in_xquote = False 85 self._in_comment = False 86 # Keeps next token 87 self._idt = []

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    def lexer(self):
92 """ 93 Scan input for lexemes and return to parser 94 95 @return: typle (token_type, token_value) 96 """ 97 98 def _token_type(tok): 99 """ 100 Determine token type 101 """ 102 103 _type = self.TOKEN_IDT 104 _tok = tok 105 106 if tok and self.macro and tok[0] == self.macro: 107 _type = self.TOKEN_MACRO 108 _tok = tok[1:] 109 110 return (_type, _tok)

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        _quoted = False

        for char in self.sdata:
            if self._done:
                self.unget(char)
                return None

            if self._in_comment and char != u"\n":
                continue

            if self._skip_next == 0:
                if 0 < self._xqtotal < self._xqcount:
                    if char != self._xqchar:
                        # Put read-ahead chars back
                        _back_tk = "%s%s" % (self._xqchar * self._xqtotal, char)
                        self.unget(_back_tk)
                        self._skip_next = len(_back_tk)
                        self._xqtotal = 0
                        continue

                if char == self._xqchar:
                    self._xqtotal += 1

                    # Assembled xquote
                    if self._xqtotal == self._xqcount:
                        if self._in_xquote:
                            # Finishing
                            self._in_xquote = False
                        else:
                            # Beginning
                            self._in_xquote = True
                            _quoted = True

                        self._xqtotal = 0

                    continue
            else:
                self._skip_next -= 1

            if self._in_xquote:
                self._idt.append(char)
                continue

            # Escape only when allowed, usually in values
            if self._can_escape:
                if self._escaped:
                    self._idt.append(char)
                    self._escaped = False
                    continue

                if char == self._escapechar:
                    self._escaped = True
                    continue

            if char == u"\n":
                if self._in_quote:
                    raise LexerError(_(u"Unterminated quote"))

                _token = None

                if self._idt or _quoted:
                    _token = u"".join(self._idt)
                    self._idt = []
                    _quoted = False
                else:
                    self._in_comment = False

                if char in self.tokens:
                    if _token is not None:
                        self.unget(char)
                    else:
                        _token = char

                if _token is not None:
                    return _token_type(_token)
                else:
                    continue

            if char == u'"':
                if self._in_quote:
                    self._in_quote = False
                else:
                    self._in_quote = True
                    _quoted = True

                continue

            if self._in_quote:
                self._idt.append(char)
                continue

            if char in self.tokens or char.isspace():
                _token = None

                # Check if we finished assembling the token
                if self._idt or _quoted:
                    _token = u"".join(self._idt)
                    self._idt = []
                    _quoted = False

                if not char.isspace():
                    if _token is not None:
                        self.unget(char)
                    else:
                        _token = char

                if _token is not None:
                    return _token_type(_token)
                else:
                    continue

            if char == self.comment and not self._in_xquote:
                # Skip to the EOL
                self._in_comment = True
                continue

            self._idt.append(char)

        if self._idt:
            _token = u"".join(self._idt)
            self._idt = []
            return _token_type(_token)

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    def get_idt(self):
        """
        Return current state of the token buffer
        """

        return self._idt

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    def done(self):
        """
        Order the lexer to stop processing
        """

        self._done = True

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    def unget(self, token):
        """
        Put a read token back into the input stream
        """

        self.sdata.unget(token)

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    def escaping_on(self):
        """
        Enable escaping
        """

        self._can_escape = True

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    def escaping_off(self):
        """
        Disable escaping
        """

        self._can_escape = False
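
Usage sketch. The following driver loop is not part of the module; it is a
minimal illustration of how a parser might pull tokens out of the Lexer. It
assumes that SourceData accepts a plain string (as the __init__ docstring
states) and that the code runs under Python 2, which this module targets.
The input text and the tokens tuple (u"=",) are invented for the example.

    # Hypothetical example, not module code (Python 2)
    from libxyz.parser import SourceData
    from libxyz.parser.lexer import Lexer

    source = SourceData(u"&path = /usr/bin\nvar = &path # comment\n")
    lexer = Lexer(source, tokens=(u"=",))

    while True:
        result = lexer.lexer()

        # lexer() returns None once the input is exhausted
        if result is None:
            break

        token_type, token_value = result

        if token_type == Lexer.TOKEN_MACRO:
            # The macro char is stripped: u"&path" yields u"path"
            print "macro:", token_value
        else:
            print "token:", token_value

Note that the lexer only tags macros (TOKEN_MACRO) and strips the macro
char; the actual expansion of &path happens at the parsing stage, as the
class docstring describes.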