Commit 5607fabd authored by Stefan Behnel's avatar Stefan Behnel

reformat Plex code files

parent 727e57d9
...@@ -7,98 +7,101 @@ ...@@ -7,98 +7,101 @@
#======================================================================= #=======================================================================
class Action(object): class Action(object):
def perform(self, token_stream, text):
pass # abstract
def perform(self, token_stream, text): def same_as(self, other):
pass # abstract return self is other
def same_as(self, other):
return self is other
class Return(Action): class Return(Action):
""" """
Internal Plex action which causes |value| to Internal Plex action which causes |value| to
be returned as the value of the associated token be returned as the value of the associated token
""" """
def __init__(self, value): def __init__(self, value):
self.value = value self.value = value
def perform(self, token_stream, text): def perform(self, token_stream, text):
return self.value return self.value
def same_as(self, other): def same_as(self, other):
return isinstance(other, Return) and self.value == other.value return isinstance(other, Return) and self.value == other.value
def __repr__(self): def __repr__(self):
return "Return(%s)" % repr(self.value) return "Return(%s)" % repr(self.value)
class Call(Action): class Call(Action):
""" """
Internal Plex action which causes a function to be called. Internal Plex action which causes a function to be called.
""" """
def __init__(self, function): def __init__(self, function):
self.function = function self.function = function
def perform(self, token_stream, text): def perform(self, token_stream, text):
return self.function(token_stream, text) return self.function(token_stream, text)
def __repr__(self): def __repr__(self):
return "Call(%s)" % self.function.__name__ return "Call(%s)" % self.function.__name__
def same_as(self, other): def same_as(self, other):
return isinstance(other, Call) and self.function is other.function return isinstance(other, Call) and self.function is other.function
class Begin(Action): class Begin(Action):
""" """
Begin(state_name) is a Plex action which causes the Scanner to Begin(state_name) is a Plex action which causes the Scanner to
enter the state |state_name|. See the docstring of Plex.Lexicon enter the state |state_name|. See the docstring of Plex.Lexicon
for more information. for more information.
""" """
def __init__(self, state_name): def __init__(self, state_name):
self.state_name = state_name self.state_name = state_name
def perform(self, token_stream, text): def perform(self, token_stream, text):
token_stream.begin(self.state_name) token_stream.begin(self.state_name)
def __repr__(self): def __repr__(self):
return "Begin(%s)" % self.state_name return "Begin(%s)" % self.state_name
def same_as(self, other): def same_as(self, other):
return isinstance(other, Begin) and self.state_name == other.state_name return isinstance(other, Begin) and self.state_name == other.state_name
class Ignore(Action): class Ignore(Action):
""" """
IGNORE is a Plex action which causes its associated token IGNORE is a Plex action which causes its associated token
to be ignored. See the docstring of Plex.Lexicon for more to be ignored. See the docstring of Plex.Lexicon for more
information. information.
""" """
def perform(self, token_stream, text):
return None def perform(self, token_stream, text):
return None
def __repr__(self):
return "IGNORE"
def __repr__(self):
return "IGNORE"
IGNORE = Ignore() IGNORE = Ignore()
#IGNORE.__doc__ = Ignore.__doc__ #IGNORE.__doc__ = Ignore.__doc__
class Text(Action): class Text(Action):
""" """
TEXT is a Plex action which causes the text of a token to TEXT is a Plex action which causes the text of a token to
be returned as the value of the token. See the docstring of be returned as the value of the token. See the docstring of
Plex.Lexicon for more information. Plex.Lexicon for more information.
""" """
def perform(self, token_stream, text):
return text
def perform(self, token_stream, text): def __repr__(self):
return text return "TEXT"
def __repr__(self):
return "TEXT"
TEXT = Text() TEXT = Text()
#TEXT.__doc__ = Text.__doc__ #TEXT.__doc__ = Text.__doc__
......
...@@ -13,147 +13,152 @@ from .Machines import LOWEST_PRIORITY ...@@ -13,147 +13,152 @@ from .Machines import LOWEST_PRIORITY
from .Transitions import TransitionMap from .Transitions import TransitionMap
def nfa_to_dfa(old_machine, debug = None): def nfa_to_dfa(old_machine, debug=None):
""" """
Given a nondeterministic Machine, return a new equivalent Given a nondeterministic Machine, return a new equivalent
Machine which is deterministic. Machine which is deterministic.
""" """
# We build a new machine whose states correspond to sets of states # We build a new machine whose states correspond to sets of states
# in the old machine. Initially we add a new state corresponding to # in the old machine. Initially we add a new state corresponding to
# the epsilon-closure of each initial old state. Then we give transitions # the epsilon-closure of each initial old state. Then we give transitions
# to each new state which are the union of all transitions out of any # to each new state which are the union of all transitions out of any
# of the corresponding old states. The new state reached on a given # of the corresponding old states. The new state reached on a given
# character is the one corresponding to the set of states reachable # character is the one corresponding to the set of states reachable
# on that character from any of the old states. As new combinations of # on that character from any of the old states. As new combinations of
# old states are created, new states are added as needed until closure # old states are created, new states are added as needed until closure
# is reached. # is reached.
new_machine = Machines.FastMachine() new_machine = Machines.FastMachine()
state_map = StateMap(new_machine) state_map = StateMap(new_machine)
# Seed the process using the initial states of the old machine. # Seed the process using the initial states of the old machine.
# Make the corresponding new states into initial states of the new # Make the corresponding new states into initial states of the new
# machine with the same names. # machine with the same names.
for (key, old_state) in old_machine.initial_states.iteritems(): for (key, old_state) in old_machine.initial_states.iteritems():
new_state = state_map.old_to_new(epsilon_closure(old_state)) new_state = state_map.old_to_new(epsilon_closure(old_state))
new_machine.make_initial_state(key, new_state) new_machine.make_initial_state(key, new_state)
# Tricky bit here: we add things to the end of this list while we're # Tricky bit here: we add things to the end of this list while we're
# iterating over it. The iteration stops when closure is achieved. # iterating over it. The iteration stops when closure is achieved.
for new_state in new_machine.states: for new_state in new_machine.states:
transitions = TransitionMap() transitions = TransitionMap()
for old_state in state_map.new_to_old(new_state): for old_state in state_map.new_to_old(new_state):
for event, old_target_states in old_state.transitions.iteritems(): for event, old_target_states in old_state.transitions.iteritems():
if event and old_target_states: if event and old_target_states:
transitions.add_set(event, set_epsilon_closure(old_target_states)) transitions.add_set(event, set_epsilon_closure(old_target_states))
for event, old_states in transitions.iteritems(): for event, old_states in transitions.iteritems():
new_machine.add_transitions(new_state, event, state_map.old_to_new(old_states)) new_machine.add_transitions(new_state, event, state_map.old_to_new(old_states))
if debug: if debug:
debug.write("\n===== State Mapping =====\n") debug.write("\n===== State Mapping =====\n")
state_map.dump(debug) state_map.dump(debug)
return new_machine return new_machine
def set_epsilon_closure(state_set): def set_epsilon_closure(state_set):
""" """
Given a set of states, return the union of the epsilon Given a set of states, return the union of the epsilon
closures of its member states. closures of its member states.
""" """
result = {} result = {}
for state1 in state_set: for state1 in state_set:
for state2 in epsilon_closure(state1): for state2 in epsilon_closure(state1):
result[state2] = 1 result[state2] = 1
return result return result
def epsilon_closure(state): def epsilon_closure(state):
""" """
Return the set of states reachable from the given state Return the set of states reachable from the given state
by epsilon moves. by epsilon moves.
""" """
# Cache the result # Cache the result
result = state.epsilon_closure result = state.epsilon_closure
if result is None: if result is None:
result = {} result = {}
state.epsilon_closure = result state.epsilon_closure = result
add_to_epsilon_closure(result, state) add_to_epsilon_closure(result, state)
return result return result
def add_to_epsilon_closure(state_set, state):
"""
Recursively add to |state_set| states reachable from the given state
by epsilon moves.
"""
if not state_set.get(state, 0):
state_set[state] = 1
state_set_2 = state.transitions.get_epsilon()
if state_set_2:
for state2 in state_set_2:
add_to_epsilon_closure(state_set, state2)
class StateMap(object): def add_to_epsilon_closure(state_set, state):
"""
Helper class used by nfa_to_dfa() to map back and forth between
sets of states from the old machine and states of the new machine.
"""
new_machine = None # Machine
old_to_new_dict = None # {(old_state,...) : new_state}
new_to_old_dict = None # {id(new_state) : old_state_set}
def __init__(self, new_machine):
self.new_machine = new_machine
self.old_to_new_dict = {}
self.new_to_old_dict= {}
def old_to_new(self, old_state_set):
""" """
Return the state of the new machine corresponding to the Recursively add to |state_set| states reachable from the given state
set of old machine states represented by |state_set|. A new by epsilon moves.
state will be created if necessary. If any of the old states
are accepting states, the new state will be an accepting state
with the highest priority action from the old states.
""" """
key = self.make_key(old_state_set) if not state_set.get(state, 0):
new_state = self.old_to_new_dict.get(key, None) state_set[state] = 1
if not new_state: state_set_2 = state.transitions.get_epsilon()
action = self.highest_priority_action(old_state_set) if state_set_2:
new_state = self.new_machine.new_state(action) for state2 in state_set_2:
self.old_to_new_dict[key] = new_state add_to_epsilon_closure(state_set, state2)
self.new_to_old_dict[id(new_state)] = old_state_set
#for old_state in old_state_set.keys():
#new_state.merge_actions(old_state) class StateMap(object):
return new_state
def highest_priority_action(self, state_set):
best_action = None
best_priority = LOWEST_PRIORITY
for state in state_set:
priority = state.action_priority
if priority > best_priority:
best_action = state.action
best_priority = priority
return best_action
# def old_to_new_set(self, old_state_set):
# """
# Return the new state corresponding to a set of old states as
# a singleton set.
# """
# return {self.old_to_new(old_state_set):1}
def new_to_old(self, new_state):
"""Given a new state, return a set of corresponding old states."""
return self.new_to_old_dict[id(new_state)]
def make_key(self, state_set):
""" """
Convert a set of states into a uniquified Helper class used by nfa_to_dfa() to map back and forth between
sorted tuple suitable for use as a dictionary key. sets of states from the old machine and states of the new machine.
""" """
lst = list(state_set) new_machine = None # Machine
lst.sort() old_to_new_dict = None # {(old_state,...) : new_state}
return tuple(lst) new_to_old_dict = None # {id(new_state) : old_state_set}
def dump(self, file): def __init__(self, new_machine):
from .Transitions import state_set_str self.new_machine = new_machine
for new_state in self.new_machine.states: self.old_to_new_dict = {}
old_state_set = self.new_to_old_dict[id(new_state)] self.new_to_old_dict = {}
file.write(" State %s <-- %s\n" % (
new_state['number'], state_set_str(old_state_set))) def old_to_new(self, old_state_set):
"""
Return the state of the new machine corresponding to the
set of old machine states represented by |state_set|. A new
state will be created if necessary. If any of the old states
are accepting states, the new state will be an accepting state
with the highest priority action from the old states.
"""
key = self.make_key(old_state_set)
new_state = self.old_to_new_dict.get(key, None)
if not new_state:
action = self.highest_priority_action(old_state_set)
new_state = self.new_machine.new_state(action)
self.old_to_new_dict[key] = new_state
self.new_to_old_dict[id(new_state)] = old_state_set
#for old_state in old_state_set.keys():
#new_state.merge_actions(old_state)
return new_state
def highest_priority_action(self, state_set):
best_action = None
best_priority = LOWEST_PRIORITY
for state in state_set:
priority = state.action_priority
if priority > best_priority:
best_action = state.action
best_priority = priority
return best_action
# def old_to_new_set(self, old_state_set):
# """
# Return the new state corresponding to a set of old states as
# a singleton set.
# """
# return {self.old_to_new(old_state_set):1}
def new_to_old(self, new_state):
"""Given a new state, return a set of corresponding old states."""
return self.new_to_old_dict[id(new_state)]
def make_key(self, state_set):
"""
Convert a set of states into a uniquified
sorted tuple suitable for use as a dictionary key.
"""
lst = list(state_set)
lst.sort()
return tuple(lst)
def dump(self, file):
from .Transitions import state_set_str
for new_state in self.new_machine.states:
old_state_set = self.new_to_old_dict[id(new_state)]
file.write(" State %s <-- %s\n" % (
new_state['number'], state_set_str(old_state_set)))
...@@ -6,45 +6,49 @@ ...@@ -6,45 +6,49 @@
# #
#======================================================================= #=======================================================================
class PlexError(Exception): class PlexError(Exception):
message = "" message = ""
class PlexTypeError(PlexError, TypeError): class PlexTypeError(PlexError, TypeError):
pass pass
class PlexValueError(PlexError, ValueError): class PlexValueError(PlexError, ValueError):
pass pass
class InvalidRegex(PlexError): class InvalidRegex(PlexError):
pass pass
class InvalidToken(PlexError): class InvalidToken(PlexError):
def __init__(self, token_number, message):
PlexError.__init__(self, "Token number %d: %s" % (token_number, message))
def __init__(self, token_number, message):
PlexError.__init__(self, "Token number %d: %s" % (token_number, message))
class InvalidScanner(PlexError): class InvalidScanner(PlexError):
pass
class AmbiguousAction(PlexError):
message = "Two tokens with different actions can match the same string"
def __init__(self):
pass pass
class UnrecognizedInput(PlexError):
scanner = None
position = None
state_name = None
def __init__(self, scanner, state_name): class AmbiguousAction(PlexError):
self.scanner = scanner message = "Two tokens with different actions can match the same string"
self.position = scanner.get_position()
self.state_name = state_name
def __str__(self):
return ("'%s', line %d, char %d: Token not recognised in state %s"
% (self.position + (repr(self.state_name),)))
def __init__(self):
pass
class UnrecognizedInput(PlexError):
scanner = None
position = None
state_name = None
def __init__(self, scanner, state_name):
self.scanner = scanner
self.position = scanner.get_position()
self.state_name = state_name
def __str__(self):
return ("'%s', line %d, char %d: Token not recognised in state %s" % (
self.position + (repr(self.state_name),)))
...@@ -22,177 +22,179 @@ DUMP_DFA = 2 ...@@ -22,177 +22,179 @@ DUMP_DFA = 2
class State(object): class State(object):
""" """
This class is used as part of a Plex.Lexicon specification to This class is used as part of a Plex.Lexicon specification to
introduce a user-defined state. introduce a user-defined state.
Constructor: Constructor:
State(name, token_specifications) State(name, token_specifications)
""" """
name = None name = None
tokens = None tokens = None
def __init__(self, name, tokens):
self.name = name
self.tokens = tokens
def __init__(self, name, tokens):
self.name = name
self.tokens = tokens
class Lexicon(object): class Lexicon(object):
""" """
Lexicon(specification) builds a lexical analyser from the given Lexicon(specification) builds a lexical analyser from the given
|specification|. The specification consists of a list of |specification|. The specification consists of a list of
specification items. Each specification item may be either: specification items. Each specification item may be either:
1) A token definition, which is a tuple: 1) A token definition, which is a tuple:
(pattern, action) (pattern, action)
The |pattern| is a regular axpression built using the The |pattern| is a regular axpression built using the
constructors defined in the Plex module. constructors defined in the Plex module.
The |action| is the action to be performed when this pattern The |action| is the action to be performed when this pattern
is recognised (see below). is recognised (see below).
2) A state definition: 2) A state definition:
State(name, tokens) State(name, tokens)
where |name| is a character string naming the state, where |name| is a character string naming the state,
and |tokens| is a list of token definitions as and |tokens| is a list of token definitions as
above. The meaning and usage of states is described above. The meaning and usage of states is described
below. below.
Actions Actions
------- -------
The |action| in a token specication may be one of three things: The |action| in a token specication may be one of three things:
1) A function, which is called as follows: 1) A function, which is called as follows:
function(scanner, text) function(scanner, text)
where |scanner| is the relevant Scanner instance, and |text| where |scanner| is the relevant Scanner instance, and |text|
is the matched text. If the function returns anything is the matched text. If the function returns anything
other than None, that value is returned as the value of the other than None, that value is returned as the value of the
token. If it returns None, scanning continues as if the IGNORE token. If it returns None, scanning continues as if the IGNORE
action were specified (see below). action were specified (see below).
2) One of the following special actions: 2) One of the following special actions:
IGNORE means that the recognised characters will be treated as IGNORE means that the recognised characters will be treated as
white space and ignored. Scanning will continue until white space and ignored. Scanning will continue until
the next non-ignored token is recognised before returning. the next non-ignored token is recognised before returning.
TEXT causes the scanned text itself to be returned as the TEXT causes the scanned text itself to be returned as the
value of the token. value of the token.
3) Any other value, which is returned as the value of the token. 3) Any other value, which is returned as the value of the token.
States States
------ ------
At any given time, the scanner is in one of a number of states. At any given time, the scanner is in one of a number of states.
Associated with each state is a set of possible tokens. When scanning, Associated with each state is a set of possible tokens. When scanning,
only tokens associated with the current state are recognised. only tokens associated with the current state are recognised.
There is a default state, whose name is the empty string. Token There is a default state, whose name is the empty string. Token
definitions which are not inside any State definition belong to definitions which are not inside any State definition belong to
the default state. the default state.
The initial state of the scanner is the default state. The state can The initial state of the scanner is the default state. The state can
be changed in one of two ways: be changed in one of two ways:
1) Using Begin(state_name) as the action of a token. 1) Using Begin(state_name) as the action of a token.
2) Calling the begin(state_name) method of the Scanner. 2) Calling the begin(state_name) method of the Scanner.
To change back to the default state, use '' as the state name. To change back to the default state, use '' as the state name.
""" """
machine = None # Machine machine = None # Machine
tables = None # StateTableMachine tables = None # StateTableMachine
def __init__(self, specifications, debug = None, debug_flags = 7, timings = None): def __init__(self, specifications, debug=None, debug_flags=7, timings=None):
if type(specifications) != types.ListType: if type(specifications) != types.ListType:
raise Errors.InvalidScanner("Scanner definition is not a list") raise Errors.InvalidScanner("Scanner definition is not a list")
if timings: if timings:
from .Timing import time from .Timing import time
total_time = 0.0
time1 = time() total_time = 0.0
nfa = Machines.Machine() time1 = time()
default_initial_state = nfa.new_initial_state('') nfa = Machines.Machine()
token_number = 1 default_initial_state = nfa.new_initial_state('')
for spec in specifications: token_number = 1
if isinstance(spec, State): for spec in specifications:
user_initial_state = nfa.new_initial_state(spec.name) if isinstance(spec, State):
for token in spec.tokens: user_initial_state = nfa.new_initial_state(spec.name)
self.add_token_to_machine( for token in spec.tokens:
nfa, user_initial_state, token, token_number) self.add_token_to_machine(
token_number = token_number + 1 nfa, user_initial_state, token, token_number)
elif type(spec) == types.TupleType: token_number += 1
self.add_token_to_machine( elif type(spec) == types.TupleType:
nfa, default_initial_state, spec, token_number) self.add_token_to_machine(
token_number = token_number + 1 nfa, default_initial_state, spec, token_number)
else: token_number += 1
raise Errors.InvalidToken( else:
token_number, raise Errors.InvalidToken(
"Expected a token definition (tuple) or State instance") token_number,
if timings: "Expected a token definition (tuple) or State instance")
time2 = time() if timings:
total_time = total_time + (time2 - time1) time2 = time()
time3 = time() total_time = total_time + (time2 - time1)
if debug and (debug_flags & 1): time3 = time()
debug.write("\n============= NFA ===========\n") if debug and (debug_flags & 1):
nfa.dump(debug) debug.write("\n============= NFA ===========\n")
dfa = DFA.nfa_to_dfa(nfa, debug = (debug_flags & 3) == 3 and debug) nfa.dump(debug)
if timings: dfa = DFA.nfa_to_dfa(nfa, debug=(debug_flags & 3) == 3 and debug)
time4 = time() if timings:
total_time = total_time + (time4 - time3) time4 = time()
if debug and (debug_flags & 2): total_time = total_time + (time4 - time3)
debug.write("\n============= DFA ===========\n") if debug and (debug_flags & 2):
dfa.dump(debug) debug.write("\n============= DFA ===========\n")
if timings: dfa.dump(debug)
timings.write("Constructing NFA : %5.2f\n" % (time2 - time1)) if timings:
timings.write("Converting to DFA: %5.2f\n" % (time4 - time3)) timings.write("Constructing NFA : %5.2f\n" % (time2 - time1))
timings.write("TOTAL : %5.2f\n" % total_time) timings.write("Converting to DFA: %5.2f\n" % (time4 - time3))
self.machine = dfa timings.write("TOTAL : %5.2f\n" % total_time)
self.machine = dfa
def add_token_to_machine(self, machine, initial_state, token_spec, token_number):
try: def add_token_to_machine(self, machine, initial_state, token_spec, token_number):
(re, action_spec) = self.parse_token_definition(token_spec)
# Disabled this -- matching empty strings can be useful
#if re.nullable:
# raise Errors.InvalidToken(
# token_number, "Pattern can match 0 input symbols")
if isinstance(action_spec, Actions.Action):
action = action_spec
else:
try: try:
action_spec.__call__ (re, action_spec) = self.parse_token_definition(token_spec)
except AttributeError: # Disabled this -- matching empty strings can be useful
action = Actions.Return(action_spec) #if re.nullable:
else: # raise Errors.InvalidToken(
action = Actions.Call(action_spec) # token_number, "Pattern can match 0 input symbols")
final_state = machine.new_state() if isinstance(action_spec, Actions.Action):
re.build_machine(machine, initial_state, final_state, action = action_spec
match_bol = 1, nocase = 0) else:
final_state.set_action(action, priority = -token_number) try:
except Errors.PlexError, e: action_spec.__call__
raise e.__class__("Token number %d: %s" % (token_number, e)) except AttributeError:
action = Actions.Return(action_spec)
def parse_token_definition(self, token_spec): else:
if type(token_spec) != types.TupleType: action = Actions.Call(action_spec)
raise Errors.InvalidToken("Token definition is not a tuple") final_state = machine.new_state()
if len(token_spec) != 2: re.build_machine(machine, initial_state, final_state,
raise Errors.InvalidToken("Wrong number of items in token definition") match_bol=1, nocase=0)
pattern, action = token_spec final_state.set_action(action, priority=-token_number)
if not isinstance(pattern, Regexps.RE): except Errors.PlexError, e:
raise Errors.InvalidToken("Pattern is not an RE instance") raise e.__class__("Token number %d: %s" % (token_number, e))
return (pattern, action)
def parse_token_definition(self, token_spec):
def get_initial_state(self, name): if type(token_spec) != types.TupleType:
return self.machine.get_initial_state(name) raise Errors.InvalidToken("Token definition is not a tuple")
if len(token_spec) != 2:
raise Errors.InvalidToken("Wrong number of items in token definition")
pattern, action = token_spec
if not isinstance(pattern, Regexps.RE):
raise Errors.InvalidToken("Pattern is not an RE instance")
return (pattern, action)
def get_initial_state(self, name):
return self.machine.get_initial_state(name)
...@@ -16,244 +16,245 @@ LOWEST_PRIORITY = -sys.maxint ...@@ -16,244 +16,245 @@ LOWEST_PRIORITY = -sys.maxint
class Machine(object): class Machine(object):
"""A collection of Nodes representing an NFA or DFA.""" """A collection of Nodes representing an NFA or DFA."""
states = None # [Node] states = None # [Node]
next_state_number = 1 next_state_number = 1
initial_states = None # {(name, bol): Node} initial_states = None # {(name, bol): Node}
def __init__(self): def __init__(self):
self.states = [] self.states = []
self.initial_states = {} self.initial_states = {}
def __del__(self): def __del__(self):
#print "Destroying", self ### #print "Destroying", self ###
for state in self.states: for state in self.states:
state.destroy() state.destroy()
def new_state(self): def new_state(self):
"""Add a new state to the machine and return it.""" """Add a new state to the machine and return it."""
s = Node() s = Node()
n = self.next_state_number n = self.next_state_number
self.next_state_number = n + 1 self.next_state_number = n + 1
s.number = n s.number = n
self.states.append(s) self.states.append(s)
return s return s
def new_initial_state(self, name): def new_initial_state(self, name):
state = self.new_state() state = self.new_state()
self.make_initial_state(name, state) self.make_initial_state(name, state)
return state return state
def make_initial_state(self, name, state): def make_initial_state(self, name, state):
self.initial_states[name] = state self.initial_states[name] = state
def get_initial_state(self, name): def get_initial_state(self, name):
return self.initial_states[name] return self.initial_states[name]
def dump(self, file): def dump(self, file):
file.write("Plex.Machine:\n") file.write("Plex.Machine:\n")
if self.initial_states is not None: if self.initial_states is not None:
file.write(" Initial states:\n") file.write(" Initial states:\n")
for (name, state) in self.initial_states.iteritems(): for (name, state) in self.initial_states.iteritems():
file.write(" '%s': %d\n" % (name, state.number)) file.write(" '%s': %d\n" % (name, state.number))
for s in self.states: for s in self.states:
s.dump(file) s.dump(file)
class Node(object): class Node(object):
"""A state of an NFA or DFA.""" """A state of an NFA or DFA."""
transitions = None # TransitionMap transitions = None # TransitionMap
action = None # Action action = None # Action
action_priority = None # integer action_priority = None # integer
number = 0 # for debug output number = 0 # for debug output
epsilon_closure = None # used by nfa_to_dfa() epsilon_closure = None # used by nfa_to_dfa()
def __init__(self): def __init__(self):
# Preinitialise the list of empty transitions, because # Preinitialise the list of empty transitions, because
# the nfa-to-dfa algorithm needs it # the nfa-to-dfa algorithm needs it
#self.transitions = {'':[]} #self.transitions = {'':[]}
self.transitions = TransitionMap() self.transitions = TransitionMap()
self.action_priority = LOWEST_PRIORITY self.action_priority = LOWEST_PRIORITY
def destroy(self): def destroy(self):
#print "Destroying", self ### #print "Destroying", self ###
self.transitions = None self.transitions = None
self.action = None self.action = None
self.epsilon_closure = None self.epsilon_closure = None
def add_transition(self, event, new_state): def add_transition(self, event, new_state):
self.transitions.add(event, new_state) self.transitions.add(event, new_state)
def link_to(self, state): def link_to(self, state):
"""Add an epsilon-move from this state to another state.""" """Add an epsilon-move from this state to another state."""
self.add_transition('', state) self.add_transition('', state)
def set_action(self, action, priority): def set_action(self, action, priority):
"""Make this an accepting state with the given action. If """Make this an accepting state with the given action. If
there is already an action, choose the action with highest there is already an action, choose the action with highest
priority.""" priority."""
if priority > self.action_priority: if priority > self.action_priority:
self.action = action self.action = action
self.action_priority = priority self.action_priority = priority
def get_action(self): def get_action(self):
return self.action return self.action
def get_action_priority(self): def get_action_priority(self):
return self.action_priority return self.action_priority
def is_accepting(self): def is_accepting(self):
return self.action is not None return self.action is not None
def __str__(self): def __str__(self):
return "State %d" % self.number return "State %d" % self.number
def dump(self, file): def dump(self, file):
# Header # Header
file.write(" State %d:\n" % self.number) file.write(" State %d:\n" % self.number)
# Transitions # Transitions
# self.dump_transitions(file) # self.dump_transitions(file)
self.transitions.dump(file) self.transitions.dump(file)
# Action # Action
action = self.action action = self.action
priority = self.action_priority priority = self.action_priority
if action is not None: if action is not None:
file.write(" %s [priority %d]\n" % (action, priority)) file.write(" %s [priority %d]\n" % (action, priority))
def __lt__(self, other): def __lt__(self, other):
return self.number < other.number return self.number < other.number
class FastMachine(object): class FastMachine(object):
""" """
FastMachine is a deterministic machine represented in a way that FastMachine is a deterministic machine represented in a way that
allows fast scanning. allows fast scanning.
""" """
initial_states = None # {state_name:state} initial_states = None # {state_name:state}
states = None # [state] states = None # [state] where state = {event:state, 'else':state, 'action':Action}
# where state = {event:state, 'else':state, 'action':Action} next_number = 1 # for debugging
next_number = 1 # for debugging
new_state_template = {
new_state_template = { '': None, 'bol': None, 'eol': None, 'eof': None, 'else': None
'':None, 'bol':None, 'eol':None, 'eof':None, 'else':None }
}
def __init__(self, old_machine=None):
def __init__(self, old_machine = None): self.initial_states = initial_states = {}
self.initial_states = initial_states = {} self.states = []
self.states = [] if old_machine:
if old_machine: self.old_to_new = old_to_new = {}
self.old_to_new = old_to_new = {} for old_state in old_machine.states:
for old_state in old_machine.states: new_state = self.new_state()
new_state = self.new_state() old_to_new[old_state] = new_state
old_to_new[old_state] = new_state for name, old_state in old_machine.initial_states.iteritems():
for name, old_state in old_machine.initial_states.iteritems(): initial_states[name] = old_to_new[old_state]
initial_states[name] = old_to_new[old_state] for old_state in old_machine.states:
for old_state in old_machine.states: new_state = old_to_new[old_state]
new_state = old_to_new[old_state] for event, old_state_set in old_state.transitions.iteritems():
for event, old_state_set in old_state.transitions.iteritems(): if old_state_set:
if old_state_set: new_state[event] = old_to_new[old_state_set.keys()[0]]
new_state[event] = old_to_new[old_state_set.keys()[0]] else:
else: new_state[event] = None
new_state[event] = None new_state['action'] = old_state.action
new_state['action'] = old_state.action
def __del__(self):
def __del__(self): for state in self.states:
for state in self.states: state.clear()
state.clear()
def new_state(self, action=None):
def new_state(self, action = None): number = self.next_number
number = self.next_number self.next_number = number + 1
self.next_number = number + 1 result = self.new_state_template.copy()
result = self.new_state_template.copy() result['number'] = number
result['number'] = number result['action'] = action
result['action'] = action self.states.append(result)
self.states.append(result) return result
return result
def make_initial_state(self, name, state):
def make_initial_state(self, name, state): self.initial_states[name] = state
self.initial_states[name] = state
def add_transitions(self, state, event, new_state, maxint=sys.maxint):
def add_transitions(self, state, event, new_state, maxint=sys.maxint): if type(event) is tuple:
if type(event) is tuple: code0, code1 = event
code0, code1 = event if code0 == -maxint:
if code0 == -maxint: state['else'] = new_state
state['else'] = new_state elif code1 != maxint:
elif code1 != maxint: while code0 < code1:
while code0 < code1: state[unichr(code0)] = new_state
state[unichr(code0)] = new_state code0 += 1
code0 = code0 + 1 else:
else: state[event] = new_state
state[event] = new_state
def get_initial_state(self, name):
def get_initial_state(self, name): return self.initial_states[name]
return self.initial_states[name]
def dump(self, file):
def dump(self, file): file.write("Plex.FastMachine:\n")
file.write("Plex.FastMachine:\n") file.write(" Initial states:\n")
file.write(" Initial states:\n") for name, state in self.initial_states.iteritems():
for name, state in self.initial_states.iteritems(): file.write(" %s: %s\n" % (repr(name), state['number']))
file.write(" %s: %s\n" % (repr(name), state['number'])) for state in self.states:
for state in self.states: self.dump_state(state, file)
self.dump_state(state, file)
def dump_state(self, state, file):
def dump_state(self, state, file): # Header
# Header file.write(" State %d:\n" % state['number'])
file.write(" State %d:\n" % state['number']) # Transitions
# Transitions self.dump_transitions(state, file)
self.dump_transitions(state, file) # Action
# Action action = state['action']
action = state['action'] if action is not None:
if action is not None: file.write(" %s\n" % action)
file.write(" %s\n" % action)
def dump_transitions(self, state, file):
def dump_transitions(self, state, file): chars_leading_to_state = {}
chars_leading_to_state = {} special_to_state = {}
special_to_state = {} for (c, s) in state.iteritems():
for (c, s) in state.iteritems(): if len(c) == 1:
if len(c) == 1: chars = chars_leading_to_state.get(id(s), None)
chars = chars_leading_to_state.get(id(s), None) if chars is None:
if chars is None: chars = []
chars = [] chars_leading_to_state[id(s)] = chars
chars_leading_to_state[id(s)] = chars chars.append(c)
chars.append(c) elif len(c) <= 4:
elif len(c) <= 4: special_to_state[c] = s
special_to_state[c] = s ranges_to_state = {}
ranges_to_state = {} for state in self.states:
for state in self.states: char_list = chars_leading_to_state.get(id(state), None)
char_list = chars_leading_to_state.get(id(state), None) if char_list:
if char_list: ranges = self.chars_to_ranges(char_list)
ranges = self.chars_to_ranges(char_list) ranges_to_state[ranges] = state
ranges_to_state[ranges] = state ranges_list = ranges_to_state.keys()
ranges_list = ranges_to_state.keys() ranges_list.sort()
ranges_list.sort() for ranges in ranges_list:
for ranges in ranges_list: key = self.ranges_to_string(ranges)
key = self.ranges_to_string(ranges) state = ranges_to_state[ranges]
state = ranges_to_state[ranges] file.write(" %s --> State %d\n" % (key, state['number']))
file.write(" %s --> State %d\n" % (key, state['number'])) for key in ('bol', 'eol', 'eof', 'else'):
for key in ('bol', 'eol', 'eof', 'else'): state = special_to_state.get(key, None)
state = special_to_state.get(key, None) if state:
if state: file.write(" %s --> State %d\n" % (key, state['number']))
file.write(" %s --> State %d\n" % (key, state['number']))
def chars_to_ranges(self, char_list):
def chars_to_ranges(self, char_list): char_list.sort()
char_list.sort() i = 0
i = 0 n = len(char_list)
n = len(char_list) result = []
result = [] while i < n:
while i < n: c1 = ord(char_list[i])
c1 = ord(char_list[i]) c2 = c1
c2 = c1 i += 1
i = i + 1 while i < n and ord(char_list[i]) == c2 + 1:
while i < n and ord(char_list[i]) == c2 + 1: i += 1
i = i + 1 c2 += 1
c2 = c2 + 1 result.append((chr(c1), chr(c2)))
result.append((chr(c1), chr(c2))) return tuple(result)
return tuple(result)
def ranges_to_string(self, range_list):
def ranges_to_string(self, range_list): return ','.join(map(self.range_to_string, range_list))
return ','.join(map(self.range_to_string, range_list))
def range_to_string(self, range_tuple):
def range_to_string(self, range_tuple): (c1, c2) = range_tuple
(c1, c2) = range_tuple if c1 == c2:
if c1 == c2: return repr(c1)
return repr(c1) else:
else: return "%s..%s" % (repr(c1), repr(c2))
return "%s..%s" % (repr(c1), repr(c2))
...@@ -42,14 +42,15 @@ def chars_to_ranges(s): ...@@ -42,14 +42,15 @@ def chars_to_ranges(s):
while i < n: while i < n:
code1 = ord(char_list[i]) code1 = ord(char_list[i])
code2 = code1 + 1 code2 = code1 + 1
i = i + 1 i += 1
while i < n and code2 >= ord(char_list[i]): while i < n and code2 >= ord(char_list[i]):
code2 = code2 + 1 code2 += 1
i = i + 1 i += 1
result.append(code1) result.append(code1)
result.append(code2) result.append(code2)
return result return result
def uppercase_range(code1, code2): def uppercase_range(code1, code2):
""" """
If the range of characters from code1 to code2-1 includes any If the range of characters from code1 to code2-1 includes any
...@@ -63,6 +64,7 @@ def uppercase_range(code1, code2): ...@@ -63,6 +64,7 @@ def uppercase_range(code1, code2):
else: else:
return None return None
def lowercase_range(code1, code2): def lowercase_range(code1, code2):
""" """
If the range of characters from code1 to code2-1 includes any If the range of characters from code1 to code2-1 includes any
...@@ -76,6 +78,7 @@ def lowercase_range(code1, code2): ...@@ -76,6 +78,7 @@ def lowercase_range(code1, code2):
else: else:
return None return None
def CodeRanges(code_list): def CodeRanges(code_list):
""" """
Given a list of codes as returned by chars_to_ranges, return Given a list of codes as returned by chars_to_ranges, return
...@@ -86,6 +89,7 @@ def CodeRanges(code_list): ...@@ -86,6 +89,7 @@ def CodeRanges(code_list):
re_list.append(CodeRange(code_list[i], code_list[i + 1])) re_list.append(CodeRange(code_list[i], code_list[i + 1]))
return Alt(*re_list) return Alt(*re_list)
def CodeRange(code1, code2): def CodeRange(code1, code2):
""" """
CodeRange(code1, code2) is an RE which matches any character CodeRange(code1, code2) is an RE which matches any character
...@@ -93,11 +97,12 @@ def CodeRange(code1, code2): ...@@ -93,11 +97,12 @@ def CodeRange(code1, code2):
""" """
if code1 <= nl_code < code2: if code1 <= nl_code < code2:
return Alt(RawCodeRange(code1, nl_code), return Alt(RawCodeRange(code1, nl_code),
RawNewline, RawNewline,
RawCodeRange(nl_code + 1, code2)) RawCodeRange(nl_code + 1, code2))
else: else:
return RawCodeRange(code1, code2) return RawCodeRange(code1, code2)
# #
# Abstract classes # Abstract classes
# #
...@@ -110,12 +115,12 @@ class RE(object): ...@@ -110,12 +115,12 @@ class RE(object):
re1 | re2 is an RE which matches either |re1| or |re2| re1 | re2 is an RE which matches either |re1| or |re2|
""" """
nullable = 1 # True if this RE can match 0 input symbols nullable = 1 # True if this RE can match 0 input symbols
match_nl = 1 # True if this RE can match a string ending with '\n' match_nl = 1 # True if this RE can match a string ending with '\n'
str = None # Set to a string to override the class's __str__ result str = None # Set to a string to override the class's __str__ result
def build_machine(self, machine, initial_state, final_state, def build_machine(self, machine, initial_state, final_state,
match_bol, nocase): match_bol, nocase):
""" """
This method should add states to |machine| to implement this This method should add states to |machine| to implement this
RE, starting at |initial_state| and ending at |final_state|. RE, starting at |initial_state| and ending at |final_state|.
...@@ -124,7 +129,7 @@ class RE(object): ...@@ -124,7 +129,7 @@ class RE(object):
letters should be treated as equivalent. letters should be treated as equivalent.
""" """
raise NotImplementedError("%s.build_machine not implemented" % raise NotImplementedError("%s.build_machine not implemented" %
self.__class__.__name__) self.__class__.__name__)
def build_opt(self, m, initial_state, c): def build_opt(self, m, initial_state, c):
""" """
...@@ -160,18 +165,18 @@ class RE(object): ...@@ -160,18 +165,18 @@ class RE(object):
self.check_string(num, value) self.check_string(num, value)
if len(value) != 1: if len(value) != 1:
raise Errors.PlexValueError("Invalid value for argument %d of Plex.%s." raise Errors.PlexValueError("Invalid value for argument %d of Plex.%s."
"Expected a string of length 1, got: %s" % ( "Expected a string of length 1, got: %s" % (
num, self.__class__.__name__, repr(value))) num, self.__class__.__name__, repr(value)))
def wrong_type(self, num, value, expected): def wrong_type(self, num, value, expected):
if type(value) == types.InstanceType: if type(value) == types.InstanceType:
got = "%s.%s instance" % ( got = "%s.%s instance" % (
value.__class__.__module__, value.__class__.__name__) value.__class__.__module__, value.__class__.__name__)
else: else:
got = type(value).__name__ got = type(value).__name__
raise Errors.PlexTypeError("Invalid type for argument %d of Plex.%s " raise Errors.PlexTypeError("Invalid type for argument %d of Plex.%s "
"(expected %s, got %s" % ( "(expected %s, got %s" % (
num, self.__class__.__name__, expected, got)) num, self.__class__.__name__, expected, got))
# #
# Primitive RE constructors # Primitive RE constructors
...@@ -211,6 +216,7 @@ class RE(object): ...@@ -211,6 +216,7 @@ class RE(object):
## def calc_str(self): ## def calc_str(self):
## return "Char(%s)" % repr(self.char) ## return "Char(%s)" % repr(self.char)
def Char(c): def Char(c):
""" """
Char(c) is an RE which matches the character |c|. Char(c) is an RE which matches the character |c|.
...@@ -222,6 +228,7 @@ def Char(c): ...@@ -222,6 +228,7 @@ def Char(c):
result.str = "Char(%s)" % repr(c) result.str = "Char(%s)" % repr(c)
return result return result
class RawCodeRange(RE): class RawCodeRange(RE):
""" """
RawCodeRange(code1, code2) is a low-level RE which matches any character RawCodeRange(code1, code2) is a low-level RE which matches any character
...@@ -230,9 +237,9 @@ class RawCodeRange(RE): ...@@ -230,9 +237,9 @@ class RawCodeRange(RE):
""" """
nullable = 0 nullable = 0
match_nl = 0 match_nl = 0
range = None # (code, code) range = None # (code, code)
uppercase_range = None # (code, code) or None uppercase_range = None # (code, code) or None
lowercase_range = None # (code, code) or None lowercase_range = None # (code, code) or None
def __init__(self, code1, code2): def __init__(self, code1, code2):
self.range = (code1, code2) self.range = (code1, code2)
...@@ -252,6 +259,7 @@ class RawCodeRange(RE): ...@@ -252,6 +259,7 @@ class RawCodeRange(RE):
def calc_str(self): def calc_str(self):
return "CodeRange(%d,%d)" % (self.code1, self.code2) return "CodeRange(%d,%d)" % (self.code1, self.code2)
class _RawNewline(RE): class _RawNewline(RE):
""" """
RawNewline is a low-level RE which matches a newline character. RawNewline is a low-level RE which matches a newline character.
...@@ -266,6 +274,7 @@ class _RawNewline(RE): ...@@ -266,6 +274,7 @@ class _RawNewline(RE):
s = self.build_opt(m, initial_state, EOL) s = self.build_opt(m, initial_state, EOL)
s.add_transition((nl_code, nl_code + 1), final_state) s.add_transition((nl_code, nl_code + 1), final_state)
RawNewline = _RawNewline() RawNewline = _RawNewline()
...@@ -304,7 +313,7 @@ class Seq(RE): ...@@ -304,7 +313,7 @@ class Seq(RE):
i = len(re_list) i = len(re_list)
match_nl = 0 match_nl = 0
while i: while i:
i = i - 1 i -= 1
re = re_list[i] re = re_list[i]
if re.match_nl: if re.match_nl:
match_nl = 1 match_nl = 1
...@@ -354,7 +363,7 @@ class Alt(RE): ...@@ -354,7 +363,7 @@ class Alt(RE):
non_nullable_res.append(re) non_nullable_res.append(re)
if re.match_nl: if re.match_nl:
match_nl = 1 match_nl = 1
i = i + 1 i += 1
self.nullable_res = nullable_res self.nullable_res = nullable_res
self.non_nullable_res = non_nullable_res self.non_nullable_res = non_nullable_res
self.nullable = nullable self.nullable = nullable
...@@ -411,7 +420,7 @@ class SwitchCase(RE): ...@@ -411,7 +420,7 @@ class SwitchCase(RE):
def build_machine(self, m, initial_state, final_state, match_bol, nocase): def build_machine(self, m, initial_state, final_state, match_bol, nocase):
self.re.build_machine(m, initial_state, final_state, match_bol, self.re.build_machine(m, initial_state, final_state, match_bol,
self.nocase) self.nocase)
def calc_str(self): def calc_str(self):
if self.nocase: if self.nocase:
...@@ -434,6 +443,7 @@ Empty.__doc__ = \ ...@@ -434,6 +443,7 @@ Empty.__doc__ = \
""" """
Empty.str = "Empty" Empty.str = "Empty"
def Str1(s): def Str1(s):
""" """
Str1(s) is an RE which matches the literal string |s|. Str1(s) is an RE which matches the literal string |s|.
...@@ -442,6 +452,7 @@ def Str1(s): ...@@ -442,6 +452,7 @@ def Str1(s):
result.str = "Str(%s)" % repr(s) result.str = "Str(%s)" % repr(s)
return result return result
def Str(*strs): def Str(*strs):
""" """
Str(s) is an RE which matches the literal string |s|. Str(s) is an RE which matches the literal string |s|.
...@@ -454,6 +465,7 @@ def Str(*strs): ...@@ -454,6 +465,7 @@ def Str(*strs):
result.str = "Str(%s)" % ','.join(map(repr, strs)) result.str = "Str(%s)" % ','.join(map(repr, strs))
return result return result
def Any(s): def Any(s):
""" """
Any(s) is an RE which matches any character in the string |s|. Any(s) is an RE which matches any character in the string |s|.
...@@ -463,6 +475,7 @@ def Any(s): ...@@ -463,6 +475,7 @@ def Any(s):
result.str = "Any(%s)" % repr(s) result.str = "Any(%s)" % repr(s)
return result return result
def AnyBut(s): def AnyBut(s):
""" """
AnyBut(s) is an RE which matches any character (including AnyBut(s) is an RE which matches any character (including
...@@ -475,6 +488,7 @@ def AnyBut(s): ...@@ -475,6 +488,7 @@ def AnyBut(s):
result.str = "AnyBut(%s)" % repr(s) result.str = "AnyBut(%s)" % repr(s)
return result return result
AnyChar = AnyBut("") AnyChar = AnyBut("")
AnyChar.__doc__ = \ AnyChar.__doc__ = \
""" """
...@@ -482,7 +496,8 @@ AnyChar.__doc__ = \ ...@@ -482,7 +496,8 @@ AnyChar.__doc__ = \
""" """
AnyChar.str = "AnyChar" AnyChar.str = "AnyChar"
def Range(s1, s2 = None):
def Range(s1, s2=None):
""" """
Range(c1, c2) is an RE which matches any single character in the range Range(c1, c2) is an RE which matches any single character in the range
|c1| to |c2| inclusive. |c1| to |c2| inclusive.
...@@ -495,11 +510,12 @@ def Range(s1, s2 = None): ...@@ -495,11 +510,12 @@ def Range(s1, s2 = None):
else: else:
ranges = [] ranges = []
for i in range(0, len(s1), 2): for i in range(0, len(s1), 2):
ranges.append(CodeRange(ord(s1[i]), ord(s1[i+1]) + 1)) ranges.append(CodeRange(ord(s1[i]), ord(s1[i + 1]) + 1))
result = Alt(*ranges) result = Alt(*ranges)
result.str = "Range(%s)" % repr(s1) result.str = "Range(%s)" % repr(s1)
return result return result
def Opt(re): def Opt(re):
""" """
Opt(re) is an RE which matches either |re| or the empty string. Opt(re) is an RE which matches either |re| or the empty string.
...@@ -508,6 +524,7 @@ def Opt(re): ...@@ -508,6 +524,7 @@ def Opt(re):
result.str = "Opt(%s)" % re result.str = "Opt(%s)" % re
return result return result
def Rep(re): def Rep(re):
""" """
Rep(re) is an RE which matches zero or more repetitions of |re|. Rep(re) is an RE which matches zero or more repetitions of |re|.
...@@ -516,12 +533,14 @@ def Rep(re): ...@@ -516,12 +533,14 @@ def Rep(re):
result.str = "Rep(%s)" % re result.str = "Rep(%s)" % re
return result return result
def NoCase(re): def NoCase(re):
""" """
NoCase(re) is an RE which matches the same strings as RE, but treating NoCase(re) is an RE which matches the same strings as RE, but treating
upper and lower case letters as equivalent. upper and lower case letters as equivalent.
""" """
return SwitchCase(re, nocase = 1) return SwitchCase(re, nocase=1)
def Case(re): def Case(re):
""" """
...@@ -529,7 +548,7 @@ def Case(re): ...@@ -529,7 +548,7 @@ def Case(re):
upper and lower case letters as distinct, i.e. it cancels the effect upper and lower case letters as distinct, i.e. it cancels the effect
of any enclosing NoCase(). of any enclosing NoCase().
""" """
return SwitchCase(re, nocase = 0) return SwitchCase(re, nocase=0)
# #
# RE Constants # RE Constants
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
from __future__ import absolute_import from __future__ import absolute_import
import cython import cython
cython.declare(BOL=object, EOL=object, EOF=object, NOT_FOUND=object) cython.declare(BOL=object, EOL=object, EOF=object, NOT_FOUND=object)
from . import Errors from . import Errors
...@@ -19,317 +20,318 @@ NOT_FOUND = object() ...@@ -19,317 +20,318 @@ NOT_FOUND = object()
class Scanner(object): class Scanner(object):
""" """
A Scanner is used to read tokens from a stream of characters A Scanner is used to read tokens from a stream of characters
using the token set specified by a Plex.Lexicon. using the token set specified by a Plex.Lexicon.
Constructor:
Scanner(lexicon, stream, name = '')
See the docstring of the __init__ method for details. Constructor:
Methods: Scanner(lexicon, stream, name = '')
See the docstrings of the individual methods for more See the docstring of the __init__ method for details.
information.
read() --> (value, text) Methods:
Reads the next lexical token from the stream.
position() --> (name, line, col) See the docstrings of the individual methods for more
Returns the position of the last token read using the information.
read() method.
begin(state_name) read() --> (value, text)
Causes scanner to change state. Reads the next lexical token from the stream.
produce(value [, text]) position() --> (name, line, col)
Causes return of a token value to the caller of the Returns the position of the last token read using the
Scanner. read() method.
""" begin(state_name)
Causes scanner to change state.
# lexicon = None # Lexicon produce(value [, text])
# stream = None # file-like object Causes return of a token value to the caller of the
# name = '' Scanner.
# buffer = ''
# buf_start_pos = 0 # position in input of start of buffer
# next_pos = 0 # position in input of next char to read
# cur_pos = 0 # position in input of current char
# cur_line = 1 # line number of current char
# cur_line_start = 0 # position in input of start of current line
# start_pos = 0 # position in input of start of token
# start_line = 0 # line number of start of token
# start_col = 0 # position in line of start of token
# text = None # text of last token read
# initial_state = None # Node
# state_name = '' # Name of initial state
# queue = None # list of tokens to be returned
# trace = 0
def __init__(self, lexicon, stream, name = '', initial_pos = None):
""" """
Scanner(lexicon, stream, name = '')
|lexicon| is a Plex.Lexicon instance specifying the lexical tokens # lexicon = None # Lexicon
to be recognised. # stream = None # file-like object
# name = ''
|stream| can be a file object or anything which implements a # buffer = ''
compatible read() method. # buf_start_pos = 0 # position in input of start of buffer
# next_pos = 0 # position in input of next char to read
|name| is optional, and may be the name of the file being # cur_pos = 0 # position in input of current char
scanned or any other identifying string. # cur_line = 1 # line number of current char
""" # cur_line_start = 0 # position in input of start of current line
self.trace = 0 # start_pos = 0 # position in input of start of token
# start_line = 0 # line number of start of token
self.buffer = u'' # start_col = 0 # position in line of start of token
self.buf_start_pos = 0 # text = None # text of last token read
self.next_pos = 0 # initial_state = None # Node
self.cur_pos = 0 # state_name = '' # Name of initial state
self.cur_line = 1 # queue = None # list of tokens to be returned
self.start_pos = 0 # trace = 0
self.start_line = 0
self.start_col = 0 def __init__(self, lexicon, stream, name='', initial_pos=None):
self.text = None """
self.state_name = None Scanner(lexicon, stream, name = '')
self.lexicon = lexicon |lexicon| is a Plex.Lexicon instance specifying the lexical tokens
self.stream = stream to be recognised.
self.name = name
self.queue = [] |stream| can be a file object or anything which implements a
self.initial_state = None compatible read() method.
self.begin('')
self.next_pos = 0 |name| is optional, and may be the name of the file being
self.cur_pos = 0 scanned or any other identifying string.
self.cur_line_start = 0 """
self.cur_char = BOL self.trace = 0
self.input_state = 1
if initial_pos is not None: self.buffer = u''
self.cur_line, self.cur_line_start = initial_pos[1], -initial_pos[2] self.buf_start_pos = 0
self.next_pos = 0
def read(self): self.cur_pos = 0
""" self.cur_line = 1
Read the next lexical token from the stream and return a self.start_pos = 0
tuple (value, text), where |value| is the value associated with self.start_line = 0
the token as specified by the Lexicon, and |text| is the actual self.start_col = 0
string read from the stream. Returns (None, '') on end of file. self.text = None
""" self.state_name = None
queue = self.queue
while not queue: self.lexicon = lexicon
self.text, action = self.scan_a_token() self.stream = stream
if action is None: self.name = name
self.produce(None) self.queue = []
self.eof() self.initial_state = None
else: self.begin('')
value = action.perform(self, self.text) self.next_pos = 0
if value is not None: self.cur_pos = 0
self.produce(value) self.cur_line_start = 0
result = queue[0] self.cur_char = BOL
del queue[0] self.input_state = 1
return result if initial_pos is not None:
self.cur_line, self.cur_line_start = initial_pos[1], -initial_pos[2]
def scan_a_token(self):
""" def read(self):
Read the next input sequence recognised by the machine """
and return (text, action). Returns ('', None) on end of Read the next lexical token from the stream and return a
file. tuple (value, text), where |value| is the value associated with
""" the token as specified by the Lexicon, and |text| is the actual
self.start_pos = self.cur_pos string read from the stream. Returns (None, '') on end of file.
self.start_line = self.cur_line """
self.start_col = self.cur_pos - self.cur_line_start queue = self.queue
action = self.run_machine_inlined() while not queue:
if action is not None: self.text, action = self.scan_a_token()
if self.trace: if action is None:
print("Scanner: read: Performing %s %d:%d" % ( self.produce(None)
action, self.start_pos, self.cur_pos)) self.eof()
text = self.buffer[self.start_pos - self.buf_start_pos : else:
self.cur_pos - self.buf_start_pos] value = action.perform(self, self.text)
return (text, action) if value is not None:
else: self.produce(value)
if self.cur_pos == self.start_pos: result = queue[0]
if self.cur_char is EOL: del queue[0]
self.next_char() return result
if self.cur_char is None or self.cur_char is EOF:
return (u'', None) def scan_a_token(self):
raise Errors.UnrecognizedInput(self, self.state_name) """
Read the next input sequence recognised by the machine
def run_machine_inlined(self): and return (text, action). Returns ('', None) on end of
""" file.
Inlined version of run_machine for speed. """
""" self.start_pos = self.cur_pos
state = self.initial_state self.start_line = self.cur_line
cur_pos = self.cur_pos self.start_col = self.cur_pos - self.cur_line_start
cur_line = self.cur_line action = self.run_machine_inlined()
cur_line_start = self.cur_line_start if action is not None:
cur_char = self.cur_char if self.trace:
input_state = self.input_state print("Scanner: read: Performing %s %d:%d" % (
next_pos = self.next_pos action, self.start_pos, self.cur_pos))
buffer = self.buffer text = self.buffer[
buf_start_pos = self.buf_start_pos self.start_pos - self.buf_start_pos:
buf_len = len(buffer) self.cur_pos - self.buf_start_pos]
b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \ return (text, action)
None, 0, 0, 0, u'', 0, 0 else:
trace = self.trace if self.cur_pos == self.start_pos:
while 1: if self.cur_char is EOL:
if trace: #TRACE# self.next_char()
print("State %d, %d/%d:%s -->" % ( #TRACE# if self.cur_char is None or self.cur_char is EOF:
state['number'], input_state, cur_pos, repr(cur_char))) #TRACE# return (u'', None)
# Begin inlined self.save_for_backup() raise Errors.UnrecognizedInput(self, self.state_name)
#action = state.action #@slow
action = state['action'] #@fast def run_machine_inlined(self):
if action is not None: """
Inlined version of run_machine for speed.
"""
state = self.initial_state
cur_pos = self.cur_pos
cur_line = self.cur_line
cur_line_start = self.cur_line_start
cur_char = self.cur_char
input_state = self.input_state
next_pos = self.next_pos
buffer = self.buffer
buf_start_pos = self.buf_start_pos
buf_len = len(buffer)
b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \ b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \
action, cur_pos, cur_line, cur_line_start, cur_char, input_state, next_pos None, 0, 0, 0, u'', 0, 0
# End inlined self.save_for_backup() trace = self.trace
c = cur_char while 1:
#new_state = state.new_state(c) #@slow if trace: #TRACE#
new_state = state.get(c, NOT_FOUND) #@fast print("State %d, %d/%d:%s -->" % ( #TRACE#
if new_state is NOT_FOUND: #@fast state['number'], input_state, cur_pos, repr(cur_char))) #TRACE#
new_state = c and state.get('else') #@fast # Begin inlined self.save_for_backup()
if new_state: #action = state.action #@slow
if trace: #TRACE# action = state['action'] #@fast
print("State %d" % new_state['number']) #TRACE# if action is not None:
state = new_state b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \
# Begin inlined: self.next_char() action, cur_pos, cur_line, cur_line_start, cur_char, input_state, next_pos
# End inlined self.save_for_backup()
c = cur_char
#new_state = state.new_state(c) #@slow
new_state = state.get(c, NOT_FOUND) #@fast
if new_state is NOT_FOUND: #@fast
new_state = c and state.get('else') #@fast
if new_state:
if trace: #TRACE#
print("State %d" % new_state['number']) #TRACE#
state = new_state
# Begin inlined: self.next_char()
if input_state == 1:
cur_pos = next_pos
# Begin inlined: c = self.read_char()
buf_index = next_pos - buf_start_pos
if buf_index < buf_len:
c = buffer[buf_index]
next_pos += 1
else:
discard = self.start_pos - buf_start_pos
data = self.stream.read(0x1000)
buffer = self.buffer[discard:] + data
self.buffer = buffer
buf_start_pos += discard
self.buf_start_pos = buf_start_pos
buf_len = len(buffer)
buf_index -= discard
if data:
c = buffer[buf_index]
next_pos += 1
else:
c = u''
# End inlined: c = self.read_char()
if c == u'\n':
cur_char = EOL
input_state = 2
elif not c:
cur_char = EOL
input_state = 4
else:
cur_char = c
elif input_state == 2:
cur_char = u'\n'
input_state = 3
elif input_state == 3:
cur_line += 1
cur_line_start = cur_pos = next_pos
cur_char = BOL
input_state = 1
elif input_state == 4:
cur_char = EOF
input_state = 5
else: # input_state = 5
cur_char = u''
# End inlined self.next_char()
else: # not new_state
if trace: #TRACE#
print("blocked") #TRACE#
# Begin inlined: action = self.back_up()
if b_action is not None:
(action, cur_pos, cur_line, cur_line_start,
cur_char, input_state, next_pos) = \
(b_action, b_cur_pos, b_cur_line, b_cur_line_start,
b_cur_char, b_input_state, b_next_pos)
else:
action = None
break # while 1
# End inlined: action = self.back_up()
self.cur_pos = cur_pos
self.cur_line = cur_line
self.cur_line_start = cur_line_start
self.cur_char = cur_char
self.input_state = input_state
self.next_pos = next_pos
if trace: #TRACE#
if action is not None: #TRACE#
print("Doing %s" % action) #TRACE#
return action
def next_char(self):
input_state = self.input_state
if self.trace:
print("Scanner: next: %s [%d] %d" % (" " * 20, input_state, self.cur_pos))
if input_state == 1: if input_state == 1:
cur_pos = next_pos self.cur_pos = self.next_pos
# Begin inlined: c = self.read_char() c = self.read_char()
buf_index = next_pos - buf_start_pos if c == u'\n':
if buf_index < buf_len: self.cur_char = EOL
c = buffer[buf_index] self.input_state = 2
next_pos = next_pos + 1 elif not c:
else: self.cur_char = EOL
discard = self.start_pos - buf_start_pos self.input_state = 4
data = self.stream.read(0x1000)
buffer = self.buffer[discard:] + data
self.buffer = buffer
buf_start_pos = buf_start_pos + discard
self.buf_start_pos = buf_start_pos
buf_len = len(buffer)
buf_index = buf_index - discard
if data:
c = buffer[buf_index]
next_pos = next_pos + 1
else: else:
c = u'' self.cur_char = c
# End inlined: c = self.read_char()
if c == u'\n':
cur_char = EOL
input_state = 2
elif not c:
cur_char = EOL
input_state = 4
else:
cur_char = c
elif input_state == 2: elif input_state == 2:
cur_char = u'\n' self.cur_char = u'\n'
input_state = 3 self.input_state = 3
elif input_state == 3: elif input_state == 3:
cur_line = cur_line + 1 self.cur_line += 1
cur_line_start = cur_pos = next_pos self.cur_line_start = self.cur_pos = self.next_pos
cur_char = BOL self.cur_char = BOL
input_state = 1 self.input_state = 1
elif input_state == 4: elif input_state == 4:
cur_char = EOF self.cur_char = EOF
input_state = 5 self.input_state = 5
else: # input_state = 5 else: # input_state = 5
cur_char = u'' self.cur_char = u''
# End inlined self.next_char() if self.trace:
else: # not new_state print("--> [%d] %d %s" % (input_state, self.cur_pos, repr(self.cur_char)))
if trace: #TRACE#
print("blocked") #TRACE# def position(self):
# Begin inlined: action = self.back_up() """
if b_action is not None: Return a tuple (name, line, col) representing the location of
(action, cur_pos, cur_line, cur_line_start, the last token read using the read() method. |name| is the
cur_char, input_state, next_pos) = \ name that was provided to the Scanner constructor; |line|
(b_action, b_cur_pos, b_cur_line, b_cur_line_start, is the line number in the stream (1-based); |col| is the
b_cur_char, b_input_state, b_next_pos) position within the line of the first character of the token
else: (0-based).
action = None """
break # while 1 return (self.name, self.start_line, self.start_col)
# End inlined: action = self.back_up()
self.cur_pos = cur_pos def get_position(self):
self.cur_line = cur_line """Python accessible wrapper around position(), only for error reporting.
self.cur_line_start = cur_line_start """
self.cur_char = cur_char return self.position()
self.input_state = input_state
self.next_pos = next_pos def begin(self, state_name):
if trace: #TRACE# """Set the current state of the scanner to the named state."""
if action is not None: #TRACE# self.initial_state = (
print("Doing %s" % action) #TRACE# self.lexicon.get_initial_state(state_name))
return action self.state_name = state_name
def next_char(self):
    """Advance the scanner to the next input character.

    Drives a small state machine over self.input_state:
      1 = normal reading (may report EOL and switch to 2 or 4),
      2 = a newline was just reported (now yield the literal u'\\n'),
      3 = start of a new line (bump line counter, report BOL),
      4 = input exhausted (report EOF),
      5 = past EOF (every further call yields u'').
    Updates self.cur_char, self.cur_pos, self.cur_line and friends.
    """
    state = self.input_state
    if self.trace:
        print("Scanner: next: %s [%d] %d" % (" "*20, state, self.cur_pos))
    if state == 1:
        self.cur_pos = self.next_pos
        c = self.read_char()
        if c == u'\n':
            self.cur_char = EOL
            self.input_state = 2
        elif not c:
            # empty read means end of data
            self.cur_char = EOL
            self.input_state = 4
        else:
            self.cur_char = c
    elif state == 2:
        self.cur_char = u'\n'
        self.input_state = 3
    elif state == 3:
        self.cur_line += 1
        self.cur_line_start = self.cur_pos = self.next_pos
        self.cur_char = BOL
        self.input_state = 1
    elif state == 4:
        self.cur_char = EOF
        self.input_state = 5
    else:  # state == 5: already past EOF
        self.cur_char = u''
    if self.trace:
        print("--> [%d] %d %s" % (state, self.cur_pos, repr(self.cur_char)))
def position(self):
    """
    Return a tuple (name, line, col) locating the last token read with
    the read() method.  |name| is the name given to the Scanner
    constructor; |line| is the 1-based line number in the stream; |col|
    is the 0-based column of the token's first character.
    """
    return (self.name, self.start_line, self.start_col)
def get_position(self):
    """Python accessible wrapper around position(), only for error reporting.
    """
    return self.position()
def begin(self, state_name):
    """Set the current state of the scanner to the named state."""
    self.initial_state = self.lexicon.get_initial_state(state_name)
    self.state_name = state_name
def produce(self, value, text=None):
    """
    Called from an action procedure; causes |value| to be returned as
    the token value from read().  If |text| is supplied, it is returned
    in place of the scanned text.

    produce() may be called more than once during a single call to an
    action procedure, in which case the tokens are queued up and
    returned one at a time by subsequent calls to read(), until the
    queue is empty, whereupon scanning resumes.
    """
    self.queue.append((value, self.text if text is None else text))
def eof(self):
    """End-of-file hook; the default does nothing.

    Override this method if you want something to be done at
    end of file.
    """
...@@ -13,147 +13,146 @@ from .Errors import PlexError ...@@ -13,147 +13,146 @@ from .Errors import PlexError
class RegexpSyntaxError(PlexError):
    """Raised by re() / REParser when a regexp string is malformed."""
    pass
def re(s):
    """
    Convert traditional string representation of regular expression |s|
    into Plex representation.
    """
    parser = REParser(s)
    return parser.parse_re()
class REParser(object):
    """
    Recursive-descent parser converting a traditional regular expression
    string into a Plex regexp tree.  Precedence, tightest first:
    primitives, the *, + and ? modifiers, sequencing, '|' alternation.
    """

    def __init__(self, s):
        self.s = s       # the regexp string being parsed
        self.i = -1      # index of the current character in |s|
        self.end = 0     # true once the end of the string is reached
        self.next()

    def parse_re(self):
        """Parse a complete regexp and verify all input was consumed."""
        re = self.parse_alt()
        if not self.end:
            self.error("Unexpected %s" % repr(self.c))
        return re

    def parse_alt(self):
        """Parse a set of alternative regexps."""
        re = self.parse_seq()
        if self.c == '|':
            re_list = [re]
            while self.c == '|':
                self.next()
                re_list.append(self.parse_seq())
            re = Alt(*re_list)
        return re

    def parse_seq(self):
        """Parse a sequence of regexps."""
        re_list = []
        while not self.end and self.c not in "|)":
            re_list.append(self.parse_mod())
        return Seq(*re_list)

    def parse_mod(self):
        """Parse a primitive regexp followed by *, +, ? modifiers."""
        re = self.parse_prim()
        while not self.end and self.c in "*+?":
            if self.c == '*':
                re = Rep(re)
            elif self.c == '+':
                re = Rep1(re)
            else:  # self.c == '?'
                re = Opt(re)
            self.next()
        return re

    def parse_prim(self):
        """Parse a primitive regexp: ., ^, $, (...), [...] or a single
        (possibly backslash-escaped) character."""
        c = self.get()
        if c == '.':
            re = AnyBut("\n")
        elif c == '^':
            re = Bol
        elif c == '$':
            re = Eol
        elif c == '(':
            re = self.parse_alt()
            self.expect(')')
        elif c == '[':
            re = self.parse_charset()
            self.expect(']')
        else:
            if c == '\\':
                c = self.get()
            re = Char(c)
        return re

    def parse_charset(self):
        """Parse a charset. Does not include the surrounding []."""
        char_list = []
        invert = 0
        if self.c == '^':
            invert = 1
            self.next()
        if self.c == ']':
            # a ']' immediately after '[' (or '[^') is a literal member
            char_list.append(']')
            self.next()
        while not self.end and self.c != ']':
            c1 = self.get()
            if self.c == '-' and self.lookahead(1) != ']':
                # a range such as a-z; a '-' just before ']' is literal
                self.next()
                c2 = self.get()
                for a in xrange(ord(c1), ord(c2) + 1):
                    char_list.append(chr(a))
            else:
                char_list.append(c1)
        chars = ''.join(char_list)
        if invert:
            return AnyBut(chars)
        else:
            return Any(chars)

    def next(self):
        """Advance to the next char."""
        s = self.s
        i = self.i = self.i + 1
        if i < len(s):
            self.c = s[i]
        else:
            self.c = ''
            self.end = 1

    def get(self):
        """Return the current char and advance, or fail at end of string."""
        if self.end:
            self.error("Premature end of string")
        c = self.c
        self.next()
        return c

    def lookahead(self, n):
        """Look ahead n chars."""
        j = self.i + n
        if j < len(self.s):
            return self.s[j]
        else:
            return ''

    def expect(self, c):
        """
        Expect to find character |c| at current position.
        Raises an exception otherwise.
        """
        if self.c == c:
            self.next()
        else:
            self.error("Missing %s" % repr(c))

    def error(self, mess):
        """Raise exception to signal syntax error in regexp."""
        raise RegexpSyntaxError("Syntax error in regexp %s at position %d: %s" % (
            repr(self.s), self.i, mess))
# #
# Plex - Transition Maps # Plex - Transition Maps
# #
# This version represents state sets directly as dicts for speed. # This version represents state sets directly as dicts for speed.
# #
from __future__ import absolute_import from __future__ import absolute_import
...@@ -10,229 +10,231 @@ from sys import maxint as maxint ...@@ -10,229 +10,231 @@ from sys import maxint as maxint
class TransitionMap(object):
    """
    A TransitionMap maps an input event to a set of states.
    An input event is one of: a range of character codes,
    the empty string (representing an epsilon move), or one
    of the special symbols BOL, EOL, EOF.

    For characters, this implementation compactly represents
    the map by means of a list:

        [code_0, states_0, code_1, states_1, code_2, states_2,
            ..., code_n-1, states_n-1, code_n]

    where |code_i| is a character code, and |states_i| is a
    set of states corresponding to characters with codes |c|
    in the range |code_i| <= |c| <= |code_i+1|.

    The following invariants hold:
        n >= 1
        code_0 == -maxint
        code_n == maxint
        code_i < code_i+1 for i in 0..n-1
        states_0 == states_n-1

    Mappings for the special events '', BOL, EOL, EOF are
    kept separately in a dictionary.
    """

    map = None       # The list of codes and states
    special = None   # Mapping for special events

    def __init__(self, map=None, special=None):
        if not map:
            map = [-maxint, {}, maxint]
        if not special:
            special = {}
        self.map = map
        self.special = special
        #self.check() ###

    def add(self, event, new_state,
            TupleType=tuple):
        """
        Add transition to |new_state| on |event|.
        """
        if type(event) is TupleType:
            code0, code1 = event
            i = self.split(code0)
            j = self.split(code1)
            map = self.map
            while i < j:
                map[i + 1][new_state] = 1
                i += 2
        else:
            self.get_special(event)[new_state] = 1

    def add_set(self, event, new_set,
                TupleType=tuple):
        """
        Add transitions to the states in |new_set| on |event|.
        """
        if type(event) is TupleType:
            code0, code1 = event
            i = self.split(code0)
            j = self.split(code1)
            map = self.map
            while i < j:
                map[i + 1].update(new_set)
                i += 2
        else:
            self.get_special(event).update(new_set)

    def get_epsilon(self,
                    none=None):
        """
        Return the mapping for epsilon, or None.
        """
        return self.special.get('', none)

    def iteritems(self,
                  len=len):
        """
        Return the mapping as an iterable of ((code1, code2), state_set) and
        (special_event, state_set) pairs.
        """
        result = []
        map = self.map
        else_set = map[1]
        i = 0
        n = len(map) - 1
        code0 = map[0]
        while i < n:
            set = map[i + 1]
            code1 = map[i + 2]
            if set or else_set:
                result.append(((code0, code1), set))
            code0 = code1
            i += 2
        for event, set in self.special.iteritems():
            if set:
                result.append((event, set))
        return iter(result)

    items = iteritems

    # ------------------- Private methods --------------------

    def split(self, code,
              len=len, maxint=maxint):
        """
        Search the list for the position of the split point for |code|,
        inserting a new split point if necessary. Returns index |i| such
        that |code| == |map[i]|.
        """
        # We use a funky variation on binary search.
        map = self.map
        hi = len(map) - 1
        # Special case: code == map[-1]
        if code == maxint:
            return hi
        # General case
        lo = 0
        # loop invariant: map[lo] <= code < map[hi] and hi - lo >= 2
        while hi - lo >= 4:
            # Find midpoint truncated to even index
            mid = ((lo + hi) // 2) & ~1
            if code < map[mid]:
                hi = mid
            else:
                lo = mid
        # map[lo] <= code < map[hi] and hi - lo == 2
        if map[lo] == code:
            return lo
        else:
            map[hi:hi] = [code, map[hi - 1].copy()]
            #self.check() ###
            return hi

    def get_special(self, event):
        """
        Get state set for special event, adding a new entry if necessary.
        """
        special = self.special
        set = special.get(event, None)
        if not set:
            set = {}
            special[event] = set
        return set

    # --------------------- Conversion methods -----------------------

    def __str__(self):
        map_strs = []
        map = self.map
        n = len(map)
        i = 0
        while i < n:
            code = map[i]
            if code == -maxint:
                code_str = "-inf"
            elif code == maxint:
                code_str = "inf"
            else:
                code_str = str(code)
            map_strs.append(code_str)
            i += 1
            if i < n:
                map_strs.append(state_set_str(map[i]))
                i += 1
        special_strs = {}
        for event, set in self.special.iteritems():
            special_strs[event] = state_set_str(set)
        return "[%s]+%s" % (
            ','.join(map_strs),
            special_strs
        )

    # --------------------- Debugging methods -----------------------

    def check(self):
        """Check data structure integrity."""
        if not self.map[-3] < self.map[-1]:
            print(self)
            assert 0

    def dump(self, file):
        map = self.map
        i = 0
        n = len(map) - 1
        while i < n:
            self.dump_range(map[i], map[i + 2], map[i + 1], file)
            i += 2
        for event, set in self.special.iteritems():
            if set:
                if not event:
                    event = 'empty'
                self.dump_trans(event, set, file)

    def dump_range(self, code0, code1, set, file):
        if set:
            if code0 == -maxint:
                if code1 == maxint:
                    k = "any"
                else:
                    k = "< %s" % self.dump_char(code1)
            elif code1 == maxint:
                k = "> %s" % self.dump_char(code0 - 1)
            elif code0 == code1 - 1:
                k = self.dump_char(code0)
            else:
                k = "%s..%s" % (self.dump_char(code0),
                                self.dump_char(code1 - 1))
            self.dump_trans(k, set, file)

    def dump_char(self, code):
        if 0 <= code <= 255:
            return repr(chr(code))
        else:
            return "chr(%d)" % code

    def dump_trans(self, key, set, file):
        file.write(" %s --> %s\n" % (key, self.dump_set(set)))

    def dump_set(self, set):
        return state_set_str(set)
# #
# State set manipulation functions # State set manipulation functions
...@@ -243,4 +245,4 @@ class TransitionMap(object): ...@@ -243,4 +245,4 @@ class TransitionMap(object):
# set1[state] = 1 # set1[state] = 1
def state_set_str(set):
    """Return a printable form like "[S1,S2]" of a state set
    (an iterable of objects with a .number attribute)."""
    return "[%s]" % ','.join("S%d" % st.number for st in set)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment