__file__ = 'ky_lexical_scanner.py'
__usage__ = (
    " = LexicalScanner( input string ) "
    "input string - eg '3+56/89^2. Arbitrary code fragments. "
    "returns a Token object describing the current token and the "
    "surrounding context tokens."
)
__description__ = (
    "Ky lexical scanner class. "
    "A general purpose lexical scanner capable of the standard functions."
)
__author__ = 'Timothy Wakeham (timmeh)'
__contact__ = 'timmeh@hiddenworlds.org'
__updated__ = '28-03-2008 @ 22:02:13'
__todo__ = None

#start

import string

from ky_token import Token
from ky_error import Error
from ky_operator import Operator, UnaryOperator


class LexicalScanner:
    """Character-level scanner that turns a code string into Token objects.

    Each call to get_symbol() returns the next token.  Token kinds produced
    are Token.NUMBER, Token.OPERATOR / Token.UNOP, Token.KEYWORD,
    Token.IDENT, Token.UNKNOWN and Token.EOF.  Newlines and the characters
    in WHITESPACE are skipped; newlines advance the line counter.
    """

    ID = 'KY_LEXICAL_SCANNER'

    # Character classes.  Membership is tested with ``in`` (see match_char).
    NUMBER = string.digits
    ALPHA = string.ascii_letters
    ALPHANUM = ALPHA + NUMBER
    OPERATOR = '+-/*^=<>&|~'
    UNARYOPS = '-!'
    NEWLINE = '\n'
    WHITESPACE = '\t '

    # Sentinel returned by get_character() once the input is exhausted.
    EOF = 'EOF'

    KEYWORDS = ['IF', 'THEN', 'ELSE']

    def __init__(self, code):
        """Prepare to scan *code* (a string) from its first character.

        The class-level OPERATOR and UNARYOPS character sets are extended
        with the characters of every operator currently registered in
        Operator.operators and UnaryOperator.unary_operators.  Only
        characters that are not already present are added, so creating many
        scanners does not make the strings grow.
        """
        self.code = code
        self.code_length = len(code)
        self.idx = 0            # index of the next character to read
        self.line = 1           # current line number (1-based)
        self.line_buffer = ''
        self.last_idx = 0       # value of idx when the previous token ended
        self.last_token = None  # most recently created token
        self.next_token = None

        for operator in Operator.operators:
            for symbol_char in operator:
                if symbol_char not in LexicalScanner.OPERATOR:
                    LexicalScanner.OPERATOR += symbol_char

        for operator in UnaryOperator.unary_operators:
            for symbol_char in operator:
                if symbol_char not in LexicalScanner.UNARYOPS:
                    LexicalScanner.UNARYOPS += symbol_char

    def match_char(self, char, match_str):
        """Return True when *char* occurs in *match_str*, else False."""
        return char in match_str

    def match_symbol(self, symbol, match_str):
        """Return True when every character of *symbol* is in *match_str*.

        An empty *symbol* yields True.
        """
        return all(self.match_char(char, match_str) for char in symbol)

    def get_character(self):
        """Advance one position and return the character just passed.

        Returns LexicalScanner.EOF once the position moves beyond the end of
        the input.  The position is advanced even in that case, so a
        following unget_character() stays balanced.
        """
        self.idx += 1
        if self.idx > self.code_length:
            return LexicalScanner.EOF
        return self.code[self.idx - 1]

    def unget_character(self):
        """Step the read position back by one character."""
        self.idx -= 1

    def unget_symbol(self, idx):
        """Reset the read position to *idx*."""
        self.idx = idx

    def _read_run(self, char, charset):
        """Return the longest run of *charset* characters starting at *char*.

        *char* is the character that has already been read.  Reading stops
        at the first character (or the EOF sentinel) that is not in
        *charset*; that character is pushed back.
        """
        run = []
        while self.match_char(char, charset):
            run.append(char)
            char = self.get_character()
        self.unget_character()  # the terminating character is not ours
        return ''.join(run)

    def _new_token(self, token_type, value):
        """Create a token at the current position and record it as the latest."""
        token = Token(token_type, value, self.last_idx, self.line,
                      self.last_token, self.next_token)
        self.last_idx = self.idx
        self.last_token = token
        return token

    def _attach_lookahead(self, token):
        """Scan one token ahead, store it in token.next, then rewind.

        The read position, the line counter and last_idx are restored so
        that the lookahead does not double-count newlines or shift the
        position recorded in the next real token.

        self.last_token is deliberately NOT restored: the scanner has always
        left it pointing at the lookahead token, and the prev.prev test in
        _scan_operator appears to rely on that.
        NOTE(review): this assumes Token exposes its fifth constructor
        argument as ``.prev`` -- confirm in ky_token.
        """
        resume_idx = self.idx
        resume_line = self.line
        token.next = self.get_symbol(next=False)
        self.unget_symbol(resume_idx)
        self.line = resume_line
        self.last_idx = resume_idx

    def _scan_operator(self, char):
        """Scan an operator whose first character *char* was already read.

        The longest run of operator characters is read and then shortened
        from the right until it names an entry of Operator.operators.  If no
        prefix is registered, the first character alone is returned as a
        Token.UNKNOWN instead of raising KeyError.
        """
        operator = self._read_run(char, LexicalScanner.OPERATOR)

        # Back off one character at a time until the name is registered.
        while operator and operator not in Operator.operators:
            operator = operator[:-1]
            self.unget_character()

        if not operator:
            # Everything was pushed back; consume just the first character.
            return self._new_token(Token.UNKNOWN, self.get_character())

        token = self._new_token(Token.OPERATOR, Operator.operators[operator])

        # An operator with nothing before it, or following another operator,
        # is reclassified as unary -- parentheses excepted.
        prev = token.prev
        if prev is None or (prev.prev and prev.prev.token_type == Token.OPERATOR):
            parentheses = [Operator.operators['('], Operator.operators[')']]
            if token.token_value not in parentheses:
                token.token_type = Token.UNOP
            else:
                token.token_type = Token.OPERATOR
        return token

    def get_symbol(self, next=True):
        """Return the next token from the input.

        next -- when true, the following token is also scanned and stored in
                the returned token's ``next`` attribute; the scanner is then
                rewound so that token is returned again by the next call.

        A Token.EOF token (value None) is returned at end of input.
        """
        while True:
            char = self.get_character()

            # End-Of-File
            if char == LexicalScanner.EOF:
                token = Token(Token.EOF, None, self.last_idx, self.line,
                              self.last_token, self.next_token)
                self.last_idx = self.idx
                return token

            # Newline
            if self.match_char(char, LexicalScanner.NEWLINE):
                self.line += 1
                self.line_buffer = ''
                continue

            # Whitespace
            if self.match_char(char, LexicalScanner.WHITESPACE):
                continue

            if self.match_char(char, LexicalScanner.NUMBER):
                # Number
                number = self._read_run(char, LexicalScanner.NUMBER)
                token = self._new_token(Token.NUMBER, number)
            elif self.match_char(char, LexicalScanner.OPERATOR):
                # Operator
                token = self._scan_operator(char)
            elif self.match_char(char, LexicalScanner.ALPHA):
                # Identifier
                identifier = self._read_run(char, LexicalScanner.ALPHANUM)
                if identifier in LexicalScanner.KEYWORDS:
                    token_type = Token.KEYWORD
                else:
                    token_type = Token.IDENT
                token = self._new_token(token_type, identifier)
            else:
                token = self._new_token(Token.UNKNOWN, char)

            if next:
                self._attach_lookahead(token)

            # Parentheses that were not picked up by the operator branch
            # are promoted to operator tokens.
            if (token.token_type == Token.UNKNOWN
                    and token.token_value in ('(', ')')):
                token.token_value = Operator.operators[token.token_value]
                token.token_type = Token.OPERATOR

            return token


if __name__ == '__main__':
    # Demo: register a few operators, then dump the token stream.
    ass = Operator('=', 'POP')
    add = Operator('+', 'ADD')
    sub = Operator('-', 'SUB')
    mul = Operator('*', 'MUL')
    div = Operator('++', 'DIV')
    exp = Operator('^', 'POW')

    lexer = LexicalScanner('345+++98-67+8^77')
    while True:
        token = lexer.get_symbol()
        print(token)
        if token.prev:
            print(token.prev.prev)
        if token.token_type == Token.EOF:
            break