104 lines
3 KiB
Python
104 lines
3 KiB
Python
"""
|
|
pygments.scanner
|
|
~~~~~~~~~~~~~~~~
|
|
|
|
This library implements a regex based scanner. Some languages
|
|
like Pascal are easy to parse but have some keywords that
|
|
depend on the context. Because of this it's impossible to lex
|
|
that just by using a regular expression lexer like the
|
|
`RegexLexer`.
|
|
|
|
Have a look at the `DelphiLexer` to get an idea of how to use
|
|
this scanner.
|
|
|
|
:copyright: Copyright 2006-2024 by the Pygments team, see AUTHORS.
|
|
:license: BSD, see LICENSE for details.
|
|
"""
|
|
import re
|
|
|
|
|
|
class EndOfText(RuntimeError):
    """
    Raised when a match function is called after the end of the
    text has already been reached.
    """
|
|
|
|
|
|
class Scanner:
    """
    Simple scanner

    All method patterns are regular expression strings (not
    compiled expressions!). Each pattern is compiled with the
    scanner's default flags on first use and cached on the instance.
    """

    def __init__(self, text, flags=0):
        """
        :param text: The text which should be scanned
        :param flags: default regular expression flags
        """
        self.data = text
        self.data_length = len(text)
        # Position at which the most recent successful `scan` started.
        self.start_pos = 0
        # Current read position; every match is anchored here.
        self.pos = 0
        self.flags = flags
        # Value `match` held before the most recent `scan` call.
        self.last = None
        # Text matched by the most recent successful `scan`.
        self.match = None
        # Maps pattern string -> compiled pattern object.
        self._re_cache = {}

    @property
    def eos(self):
        """`True` if the scanner reached the end of text."""
        return self.pos >= self.data_length

    def _compile(self, pattern):
        """
        Return the compiled form of `pattern`, compiling it with
        ``self.flags`` and caching it on first use.

        The cache is keyed on the pattern string only.
        """
        try:
            return self._re_cache[pattern]
        except KeyError:
            compiled = re.compile(pattern, self.flags)
            self._re_cache[pattern] = compiled
            return compiled

    def check(self, pattern):
        """
        Apply `pattern` on the current position and return
        the match object (or `None` if it does not match).
        Doesn't touch pos. Use this for lookahead.

        :raises EndOfText: if the scanner is already at the end of text
        """
        # The end-of-text check comes first, before the pattern is compiled.
        if self.eos:
            raise EndOfText()
        return self._compile(pattern).match(self.data, self.pos)

    def test(self, pattern):
        """
        Apply a pattern on the current position and check
        if it matches. Doesn't touch pos.

        :raises EndOfText: if the scanner is already at the end of text
        """
        return self.check(pattern) is not None

    def scan(self, pattern):
        """
        Scan the text for the given pattern and update pos/match
        and related fields. The return value is a boolean that
        indicates if the pattern matched. The matched value is
        stored on the instance as ``match``, the last value is
        stored as ``last``. ``start_pos`` is the position of the
        pointer before the pattern was matched, ``pos`` is the
        end position.

        Note that ``last`` is updated on every call, even when the
        pattern does not match; ``match``, ``start_pos`` and ``pos``
        only change on a successful match.

        :raises EndOfText: if the scanner is already at the end of text
        """
        if self.eos:
            raise EndOfText()
        regex = self._compile(pattern)
        # Recorded before matching, so it also changes on a failed scan.
        self.last = self.match
        m = regex.match(self.data, self.pos)
        if m is None:
            return False
        self.start_pos = m.start()
        self.pos = m.end()
        self.match = m.group()
        return True

    def get_char(self):
        """
        Scan exactly one char.

        This scans the pattern ``'.'``, so whether a newline is
        consumed depends on the scanner's flags. Returns the result
        of `scan`: `True` if a character was consumed.

        :raises EndOfText: if the scanner is already at the end of text
        """
        return self.scan('.')

    def __repr__(self):
        return '<%s %d/%d>' % (
            self.__class__.__name__,
            self.pos,
            self.data_length
        )
|