Source code for mwparserfromhell.parser.tokenizer

# -*- coding: utf-8  -*-
#
# Copyright (C) 2012-2013 Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
from math import log
import re

from . import contexts
from . import tokens
from ..compat import htmlentities

__all__ = ["Tokenizer"]

class BadRoute(Exception):
    """Signal that the tokenization route being attempted is invalid.

    This exception is internal to the tokenizer: it is raised by
    ``Tokenizer._fail_route`` and caught by the parsing methods that started
    the speculative route, which then rewind and treat the input as text.
    """
class Tokenizer(object):
    """Creates a list of tokens from a string of wikicode.

    The input is split into segments (runs of plain text and single marker
    characters) and scanned with a head index. Speculative constructs such as
    templates, arguments, wikilinks, headings, entities, and comments are
    parsed on their own stack; if a route turns out to be invalid,
    :py:exc:`BadRoute` is raised, the head is rewound, and the opening
    characters are emitted as plain text instead.
    """

    # False on this implementation.
    # NOTE(review): presumably distinguishes it from a C tokenizer; confirm
    # against callers.
    USES_C = False

    # Sentinels returned by _read() for positions before/after the text.
    START = object()
    END = object()

    # Segments that need special handling in _parse(); anything else is
    # written straight to the textbuffer.
    MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":",
               "/", "-", "!", "\n", END]

    # Limits checked by _can_recurse() before starting a nested route.
    MAX_DEPTH = 40
    MAX_CYCLES = 100000

    # Splits text on marker characters; the capturing group keeps each marker
    # as its own segment in the result.
    regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE)

    def __init__(self):
        self._text = None   # List of segments produced by tokenize()
        self._head = 0      # Index of the current segment in self._text
        self._stacks = []   # List of [token stack, context, textbuffer]
        self._global = 0    # Global context flags (e.g. GL_HEADING)
        self._depth = 0     # Number of stacks currently pushed
        self._cycles = 0    # Total number of stacks pushed so far

    @property
    def _stack(self):
        """The current token stack."""
        return self._stacks[-1][0]

    @property
    def _context(self):
        """The current token context."""
        return self._stacks[-1][1]

    @_context.setter
    def _context(self, value):
        self._stacks[-1][1] = value

    @property
    def _textbuffer(self):
        """The current textbuffer."""
        return self._stacks[-1][2]

    @_textbuffer.setter
    def _textbuffer(self, value):
        self._stacks[-1][2] = value

    def _push(self, context=0):
        """Add a new token stack, context, and textbuffer to the list."""
        self._stacks.append([[], context, []])
        self._depth += 1
        self._cycles += 1

    def _push_textbuffer(self):
        """Push the textbuffer onto the stack as a Text node and clear it."""
        if self._textbuffer:
            self._stack.append(tokens.Text(text="".join(self._textbuffer)))
            self._textbuffer = []

    def _pop(self, keep_context=False):
        """Pop the current stack/context/textbuffer, returning the stack.

        Any pending text is flushed into the stack first. If *keep_context*
        is ``True``, then we will replace the underlying stack's context with
        the current stack's.
        """
        self._push_textbuffer()
        self._depth -= 1
        if keep_context:
            # Read the context before popping; write it after, so that it
            # lands on the stack underneath.
            context = self._context
            stack = self._stacks.pop()[0]
            self._context = context
            return stack
        return self._stacks.pop()[0]

    def _can_recurse(self):
        """Return whether or not our max recursion depth has been exceeded.

        Returns ``True`` while both the current depth and the total number of
        pushes are below their limits (``MAX_DEPTH`` and ``MAX_CYCLES``).
        """
        return self._depth < self.MAX_DEPTH and self._cycles < self.MAX_CYCLES

    def _fail_route(self):
        """Fail the current tokenization route.

        Discards the current stack/context/textbuffer and raises
        :py:exc:`~.BadRoute`.
        """
        self._pop()
        raise BadRoute()

    def _write(self, token):
        """Write a token to the end of the current token stack."""
        self._push_textbuffer()
        self._stack.append(token)

    def _write_first(self, token):
        """Write a token to the beginning of the current token stack."""
        self._push_textbuffer()
        self._stack.insert(0, token)

    def _write_text(self, text):
        """Write text to the current textbuffer."""
        self._textbuffer.append(text)

    def _write_all(self, tokenlist):
        """Write a series of tokens to the current stack at once.

        If the first token is a Text token, its text is appended to the
        current textbuffer so that it merges with any pending text. The
        caller's *tokenlist* is not modified.
        """
        if tokenlist and isinstance(tokenlist[0], tokens.Text):
            self._write_text(tokenlist[0].text)
            # Slice instead of pop(0) so the caller's list is left intact.
            tokenlist = tokenlist[1:]
        self._push_textbuffer()
        self._stack.extend(tokenlist)

    def _write_text_then_stack(self, text):
        """Pop the current stack, write *text*, and then write the stack."""
        stack = self._pop()
        self._write_text(text)
        if stack:
            self._write_all(stack)
        # Step back one segment; the main loop in _parse() advances the head
        # again after the handler returns.
        self._head -= 1

    def _read(self, delta=0, wrap=False, strict=False):
        """Read the value at a relative point in the wikicode.

        The value is read from :py:attr:`self._head <_head>` plus the value of
        *delta* (which can be negative). If *wrap* is ``False``, we will not
        allow attempts to read from the end of the string if ``self._head +
        delta`` is negative. If *strict* is ``True``, the route will be failed
        (with :py:meth:`_fail_route`) if we try to read from past the end of
        the string; otherwise, :py:attr:`self.END <END>` is returned. If we try
        to read from before the start of the string, :py:attr:`self.START
        <START>` is returned.
        """
        index = self._head + delta
        if index < 0 and (not wrap or abs(index) > len(self._text)):
            return self.START
        try:
            return self._text[index]
        except IndexError:
            if strict:
                self._fail_route()
            return self.END

    def _parse_template_or_argument(self):
        """Parse a template or argument at the head of the wikicode string.

        Counts the run of opening braces, then repeatedly tries to consume
        three of them as an argument or two of them as a template. Braces
        that cannot be matched are written back as plain text.
        """
        self._head += 2
        braces = 2
        while self._read() == "{":
            self._head += 1
            braces += 1
        self._push()

        while braces:
            if braces == 1:
                return self._write_text_then_stack("{")
            if braces == 2:
                try:
                    self._parse_template()
                except BadRoute:
                    return self._write_text_then_stack("{{")
                break
            # Three or more braces: prefer an argument, fall back to a
            # template, and give up if neither route succeeds.
            try:
                self._parse_argument()
                braces -= 3
            except BadRoute:
                try:
                    self._parse_template()
                    braces -= 2
                except BadRoute:
                    return self._write_text_then_stack("{" * braces)
            if braces:
                self._head += 1

        self._write_all(self._pop())

    def _parse_template(self):
        """Parse a template at the head of the wikicode string.

        On :py:exc:`BadRoute` the head is restored and the exception is
        re-raised for the caller to handle.
        """
        reset = self._head
        try:
            template = self._parse(contexts.TEMPLATE_NAME)
        except BadRoute:
            self._head = reset
            raise
        # The open token goes first so that it wraps anything already on the
        # current stack.
        self._write_first(tokens.TemplateOpen())
        self._write_all(template)
        self._write(tokens.TemplateClose())

    def _parse_argument(self):
        """Parse an argument at the head of the wikicode string.

        On :py:exc:`BadRoute` the head is restored and the exception is
        re-raised for the caller to handle.
        """
        reset = self._head
        try:
            argument = self._parse(contexts.ARGUMENT_NAME)
        except BadRoute:
            self._head = reset
            raise
        self._write_first(tokens.ArgumentOpen())
        self._write_all(argument)
        self._write(tokens.ArgumentClose())

    def _handle_template_param(self):
        """Handle a template parameter at the head of the string."""
        if self._context & contexts.TEMPLATE_NAME:
            self._context ^= contexts.TEMPLATE_NAME
        elif self._context & contexts.TEMPLATE_PARAM_VALUE:
            self._context ^= contexts.TEMPLATE_PARAM_VALUE
        elif self._context & contexts.TEMPLATE_PARAM_KEY:
            # The previous parameter never got a value; merge its key stack
            # back into the template's stack.
            self._write_all(self._pop(keep_context=True))
        self._context |= contexts.TEMPLATE_PARAM_KEY
        self._write(tokens.TemplateParamSeparator())
        # The new parameter's key is collected on its own stack.
        self._push(self._context)

    def _handle_template_param_value(self):
        """Handle a template parameter's value at the head of the string."""
        self._write_all(self._pop(keep_context=True))
        self._context ^= contexts.TEMPLATE_PARAM_KEY
        self._context |= contexts.TEMPLATE_PARAM_VALUE
        self._write(tokens.TemplateParamEquals())

    def _handle_template_end(self):
        """Handle the end of a template at the head of the string.

        Returns the finished template's token stack.
        """
        if self._context & contexts.TEMPLATE_PARAM_KEY:
            self._write_all(self._pop(keep_context=True))
        self._head += 1
        return self._pop()

    def _handle_argument_separator(self):
        """Handle the separator between an argument's name and default."""
        self._context ^= contexts.ARGUMENT_NAME
        self._context |= contexts.ARGUMENT_DEFAULT
        self._write(tokens.ArgumentSeparator())

    def _handle_argument_end(self):
        """Handle the end of an argument at the head of the string.

        Returns the finished argument's token stack.
        """
        self._head += 2
        return self._pop()

    # NOTE(review): the three wikilink handlers below are called by _parse()
    # but were not defined in this class. They follow the same pattern as the
    # comment/argument handlers; the tokens.Wikilink* classes and
    # contexts.WIKILINK_TEXT are assumed to exist -- confirm against the
    # tokens and contexts modules.

    def _parse_wikilink(self):
        """Parse an internal wikilink at the head of the wikicode string.

        If the route fails, the head is rewound and ``[[`` is written as
        plain text.
        """
        self._head += 2
        reset = self._head - 1
        try:
            wikilink = self._parse(contexts.WIKILINK_TITLE)
        except BadRoute:
            self._head = reset
            self._write_text("[[")
        else:
            self._write(tokens.WikilinkOpen())
            self._write_all(wikilink)
            self._write(tokens.WikilinkClose())

    def _handle_wikilink_separator(self):
        """Handle the separator between a wikilink's title and its text."""
        self._context ^= contexts.WIKILINK_TITLE
        self._context |= contexts.WIKILINK_TEXT
        self._write(tokens.WikilinkSeparator())

    def _handle_wikilink_end(self):
        """Handle the end of a wikilink at the head of the string.

        Returns the finished wikilink's token stack.
        """
        self._head += 1
        return self._pop()

    def _parse_heading(self):
        """Parse a section heading at the head of the wikicode string.

        The GL_HEADING global flag is set for the duration of the attempt and
        always cleared afterwards. If the route fails, the run of ``=`` is
        written as plain text.
        """
        self._global |= contexts.GL_HEADING
        reset = self._head
        self._head += 1
        best = 1
        while self._read() == "=":
            best += 1
            self._head += 1
        # The shift is capped at 5, i.e. at most six heading-level contexts.
        context = contexts.HEADING_LEVEL_1 << min(best - 1, 5)

        try:
            title, level = self._parse(context)
        except BadRoute:
            self._head = reset + best - 1
            self._write_text("=" * best)
        else:
            self._write(tokens.HeadingStart(level=level))
            if level < best:
                # Surplus opening "=" characters become part of the title.
                self._write_text("=" * (best - level))
            self._write_all(title)
            self._write(tokens.HeadingEnd())
        finally:
            self._global ^= contexts.GL_HEADING

    def _handle_heading_end(self):
        """Handle the end of a section heading at the head of the string.

        Returns a ``(stack, level)`` tuple: the heading's token stack and the
        heading level that was settled on.
        """
        reset = self._head
        self._head += 1
        best = 1
        while self._read() == "=":
            best += 1
            self._head += 1
        # Recover the opening level from the position of the heading bit in
        # the context, relative to HEADING_LEVEL_1.
        current = int(log(self._context / contexts.HEADING_LEVEL_1, 2)) + 1
        level = min(current, min(best, 6))

        try:  # Try to check for a heading closure after this one
            after, after_level = self._parse(self._context)
        except BadRoute:
            if level < best:
                self._write_text("=" * (best - level))
            self._head = reset + best - 1
            return self._pop(), level
        else:  # Found another closure
            self._write_text("=" * best)
            self._write_all(after)
            return self._pop(), after_level

    def _really_parse_entity(self):
        """Actually parse an HTML entity and ensure that it is valid.

        Fails the route (raising :py:exc:`BadRoute`) if the entity is
        malformed, is not terminated by ``;``, is a numeric entity outside
        ``1..0x10FFFF``, or is a named entity not found in
        ``htmlentities.entitydefs``.
        """
        self._write(tokens.HTMLEntityStart())
        self._head += 1

        this = self._read(strict=True)
        if this == "#":
            numeric = True
            self._write(tokens.HTMLEntityNumeric())
            self._head += 1
            this = self._read(strict=True)
            if this[0].lower() == "x":
                hexadecimal = True
                self._write(tokens.HTMLEntityHex(char=this[0]))
                this = this[1:]
                if not this:
                    self._fail_route()
            else:
                hexadecimal = False
        else:
            numeric = hexadecimal = False

        valid = "0123456789abcdefABCDEF" if hexadecimal else "0123456789"
        if not numeric and not hexadecimal:
            valid += "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
        if not all(char in valid for char in this):
            self._fail_route()

        self._head += 1
        if self._read() != ";":
            self._fail_route()
        if numeric:
            test = int(this, 16) if hexadecimal else int(this)
            if test < 1 or test > 0x10FFFF:
                self._fail_route()
        else:
            if this not in htmlentities.entitydefs:
                self._fail_route()

        self._write(tokens.Text(text=this))
        self._write(tokens.HTMLEntityEnd())

    def _parse_entity(self):
        """Parse an HTML entity at the head of the wikicode string.

        If the entity is invalid, the head is rewound and the segment at the
        head is written as plain text.
        """
        reset = self._head
        self._push()
        try:
            self._really_parse_entity()
        except BadRoute:
            self._head = reset
            self._write_text(self._read())
        else:
            self._write_all(self._pop())

    def _parse_comment(self):
        """Parse an HTML comment at the head of the wikicode string.

        If no closing ``-->`` is found, the head is rewound and ``<!--`` is
        written as plain text.
        """
        self._head += 4
        reset = self._head - 1
        try:
            comment = self._parse(contexts.COMMENT)
        except BadRoute:
            self._head = reset
            self._write_text("<!--")
        else:
            self._write(tokens.CommentStart())
            self._write_all(comment)
            self._write(tokens.CommentEnd())
            # Skip the rest of the closing "-->"; _parse() stopped at its
            # first character.
            self._head += 2

    def _verify_safe(self, this):
        """Make sure we are not trying to write an invalid character.

        Returns ``False`` if *this* is not allowed in the current context (the
        caller then fails the route) and ``True`` otherwise. May set or clear
        the FAIL_* and HAS_TEXT flags on the current context as a side effect.
        """
        context = self._context
        if context & contexts.FAIL_NEXT:
            return False
        if context & contexts.WIKILINK_TITLE:
            if this == "]" or this == "{":
                self._context |= contexts.FAIL_NEXT
            elif this == "\n" or this == "[" or this == "}":
                return False
            return True
        if context & contexts.TEMPLATE_NAME:
            if this == "{" or this == "}" or this == "[":
                self._context |= contexts.FAIL_NEXT
                return True
            if this == "]":
                return False
            if this == "|":
                return True
            if context & contexts.HAS_TEXT:
                if context & contexts.FAIL_ON_TEXT:
                    # A newline was seen after the name's text; only
                    # whitespace may follow.
                    if this is self.END or not this.isspace():
                        return False
                else:
                    if this == "\n":
                        self._context |= contexts.FAIL_ON_TEXT
            elif this is self.END or not this.isspace():
                self._context |= contexts.HAS_TEXT
            return True
        else:
            if context & contexts.FAIL_ON_EQUALS:
                if this == "=":
                    return False
            elif context & contexts.FAIL_ON_LBRACE:
                if this == "{" or (self._read(-1) == self._read(-2) == "{"):
                    if context & contexts.TEMPLATE:
                        self._context |= contexts.FAIL_ON_EQUALS
                    else:
                        self._context |= contexts.FAIL_NEXT
                    return True
                self._context ^= contexts.FAIL_ON_LBRACE
            elif context & contexts.FAIL_ON_RBRACE:
                if this == "}":
                    if context & contexts.TEMPLATE:
                        self._context |= contexts.FAIL_ON_EQUALS
                    else:
                        self._context |= contexts.FAIL_NEXT
                    return True
                self._context ^= contexts.FAIL_ON_RBRACE
            elif this == "{":
                self._context |= contexts.FAIL_ON_LBRACE
            elif this == "}":
                self._context |= contexts.FAIL_ON_RBRACE
            return True

    def _parse(self, context=0):
        """Parse the wikicode string, using *context* for when to stop.

        Pushes a new stack with *context* and scans segments from the head
        until the construct described by the context ends (or the text ends).
        Returns the popped token stack; heading contexts return the
        ``(stack, level)`` tuple from :py:meth:`_handle_heading_end`. Raises
        :py:exc:`BadRoute` if the route is invalid.
        """
        # Contexts in which every segment must pass _verify_safe().
        unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE |
                  contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME)
        # Contexts that must not still be open when the text ends.
        fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK |
                contexts.HEADING | contexts.COMMENT)

        self._push(context)
        while True:
            this = self._read()
            if self._context & unsafe:
                if not self._verify_safe(this):
                    # A parameter key sits on its own stack (pushed by
                    # _handle_template_param); discard it so that
                    # _fail_route() pops the stack underneath as well.
                    if self._context & contexts.TEMPLATE_PARAM_KEY:
                        self._pop()
                    self._fail_route()
            if this not in self.MARKERS:
                self._write_text(this)
                self._head += 1
                continue
            if this is self.END:
                if self._context & contexts.TEMPLATE_PARAM_KEY:
                    self._pop()
                # Deliberately re-read the context: after the pop above it
                # is the underlying stack's context.
                if self._context & fail:
                    self._fail_route()
                return self._pop()
            nxt = self._read(1)
            if self._context & contexts.COMMENT:
                if this == nxt == "-" and self._read(2) == ">":
                    return self._pop()
                else:
                    self._write_text(this)
            elif this == nxt == "{":
                if self._can_recurse():
                    self._parse_template_or_argument()
                    if self._context & contexts.FAIL_NEXT:
                        self._context ^= contexts.FAIL_NEXT
                else:
                    self._write_text("{")
            elif this == "|" and self._context & contexts.TEMPLATE:
                self._handle_template_param()
            elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY:
                self._handle_template_param_value()
            elif this == nxt == "}" and self._context & contexts.TEMPLATE:
                return self._handle_template_end()
            elif this == "|" and self._context & contexts.ARGUMENT_NAME:
                self._handle_argument_separator()
            elif this == nxt == "}" and self._context & contexts.ARGUMENT:
                if self._read(2) == "}":
                    return self._handle_argument_end()
                else:
                    self._write_text("}")
            elif this == nxt == "[":
                if (not self._context & contexts.WIKILINK_TITLE and
                        self._can_recurse()):
                    self._parse_wikilink()
                    if self._context & contexts.FAIL_NEXT:
                        self._context ^= contexts.FAIL_NEXT
                else:
                    self._write_text("[")
            elif this == "|" and self._context & contexts.WIKILINK_TITLE:
                self._handle_wikilink_separator()
            elif this == nxt == "]" and self._context & contexts.WIKILINK:
                return self._handle_wikilink_end()
            elif this == "=" and not self._global & contexts.GL_HEADING:
                # Headings may only start at the beginning of a line.
                if self._read(-1) in ("\n", self.START):
                    self._parse_heading()
                else:
                    self._write_text("=")
            elif this == "=" and self._context & contexts.HEADING:
                return self._handle_heading_end()
            elif this == "\n" and self._context & contexts.HEADING:
                self._fail_route()
            elif this == "&":
                self._parse_entity()
            elif this == "<" and nxt == "!":
                if self._read(2) == self._read(3) == "-":
                    self._parse_comment()
                else:
                    self._write_text(this)
            else:
                self._write_text(this)
            self._head += 1

    def tokenize(self, text):
        """Build a list of tokens from a string of wikicode and return it.

        All parsing state is reset first, so a single instance can safely be
        used for more than one call.
        """
        split = self.regex.split(text)
        # re.split() yields empty strings between adjacent markers; drop them.
        self._text = [segment for segment in split if segment]
        self._head = self._global = self._depth = self._cycles = 0
        self._stacks = []
        return self._parse()

This Page