编写Python文本解析器

文本块生成器
首先我们需要有一个文本块生成器把纯文本分成一个一个的文本块,以便接下来对每一个文本块进行解析,util.py代码如下:
#!/usr/bin/python
# encoding: utf-8
# 实现文本块生成器把纯文本分成一个一个的文本块,以便接下来对每一个文本块进行解析

def lines(file):
    """Yield every line of *file*, then one extra blank line.

    The appended newline guarantees that the input always ends with a
    blank line, so a consumer that emits a block whenever it meets a
    blank line also emits the last block of the input.

    Args:
        file: Any iterable of text lines (an open file, a list, ...).

    Yields:
        Each line of *file* unchanged, followed by a single newline string.
    """
    for line in file:
        yield line
    # Sentinel blank line terminating the final block.
    yield '\n'

def blocks(file):
    """Yield the blocks of *file*, where blank lines separate blocks.

    Consecutive non-blank lines are collected into one block. When a blank
    (whitespace-only) line is reached and a block has been collected, the
    lines are joined, surrounding whitespace is stripped, and the result is
    yielded. Runs of several blank lines produce no empty blocks.

    Args:
        file: Any iterable of text lines; it is read through ``lines`` so
            the last block is flushed even without a trailing blank line.

    Yields:
        One non-empty, stripped string per block.
    """
    block = []
    for line in lines(file):
        if line.strip():
            # Non-blank line: still inside the current block.
            block.append(line)
        elif block:
            # Blank line after content: the block is complete.
            yield ''.join(block).strip()
            block = []

strip()函数可以去除一个字符串前后的空格以及换行符。如果给strip()传入参数,如strip("me"),则会去除字符串首尾所有属于该参数字符集合(这里是'm'和'e')的字符,而不是把"me"当作一个整体子串来匹配。
>>> s = " This is me\n"
>>> s = s.strip()
>>> s
'This is me'
>>> s = s.strip("me")
>>> s
'This is '

yield是一个语句而不是函数:包含yield语句的函数在被调用时会返回一个生成器(generator)。如果对generator以及对yield语句不太熟悉,建议先阅读yield解释
处理程序
假设我们已经知道一个文本块是title/paragraph/heading/list,我们通过 handlers.py给他们打上合适的HTML标记,代码如下:
#!/usr/bin/python
# encoding: utf-8
# 为文本块打上合适的HTML标记

class Handler:
    """Base class for handlers that dispatch events to methods by name.

    Subclasses define methods named ``start_<name>``, ``end_<name>`` and
    ``sub_<name>``; the methods below look them up dynamically and silently
    do nothing when a given name has no matching method.
    """

    def callback(self, prefix, name, *args):
        """Call the method named ``prefix + name`` with *args*, if present.

        Returns:
            The method's return value, or None when the attribute does not
            exist or is not callable.
        """
        method = getattr(self, prefix + name, None)
        if callable(method):
            return method(*args)
        return None

    def start(self, name):
        """Invoke ``start_<name>`` if the handler defines it."""
        self.callback('start_', name)

    def end(self, name):
        """Invoke ``end_<name>`` if the handler defines it."""
        self.callback('end_', name)

    def sub(self, name):
        """Return a replacement function suitable for ``re.sub``.

        The returned function passes the match object to ``sub_<name>``.
        When that method is missing or returns None, the matched text is
        returned unchanged so the substitution becomes a no-op.
        """
        def substitution(match):
            result = self.callback('sub_', name, match)
            if result is None:
                # No handler for this name: keep the original text.
                result = match.group(0)
            return result
        return substitution

class HTMLRenderer(Handler):
    """Handler that prints HTML markup around each kind of text block.

    NOTE(review): in the extracted source every HTML tag inside the string
    literals had been stripped (leaving e.g. a format string with one
    ``%s`` applied to two arguments, which raises TypeError). The tags
    below are a minimal reconstruction; any attributes the original markup
    carried could not be recovered -- confirm against the original
    handlers.py.
    """

    def start_document(self):
        """Print the document preamble, with 'ShiYanLou' as the page title."""
        print('<html><head><title>ShiYanLou</title></head><body>')

    def end_document(self):
        """Print the closing tags of the document."""
        print('</body></html>')

    def start_paragraph(self):
        """Open a paragraph."""
        print('<p>')

    def end_paragraph(self):
        """Close a paragraph."""
        print('</p>')

    def start_heading(self):
        """Open a section heading."""
        print('<h2>')

    def end_heading(self):
        """Close a section heading."""
        print('</h2>')

    def start_list(self):
        """Open an unordered list."""
        print('<ul>')

    def end_list(self):
        """Close an unordered list."""
        print('</ul>')

    def start_listitem(self):
        """Open a list item."""
        print('<li>')

    def end_listitem(self):
        """Close a list item."""
        print('</li>')

    def start_title(self):
        """Open the document title heading."""
        print('<h1>')

    def end_title(self):
        """Close the document title heading."""
        print('</h1>')

    def sub_emphasis(self, match):
        """Return the first captured group wrapped in an emphasis tag."""
        return '<em>%s</em>' % match.group(1)

    def sub_url(self, match):
        """Return a hyperlink whose target and text are the captured URL."""
        return '<a href="%s">%s</a>' % (match.group(1), match.group(1))

    def sub_mail(self, match):
        """Return a mailto link whose target and text are the captured address."""
        return '<a href="mailto:%s">%s</a>' % (match.group(1), match.group(1))

    def feed(self, data):
        """Print the block text itself, unmodified."""
        print(data)

    在上面的代码中callable()函数能够检查一个对象是否能够被调用,如果能够被调用则返回True。
    getattr()函数则是返回一个对象的属性值。举例来说,getattr(x, 'foo', None)就相当于x.foo,而如果x没有foo这个属性,则返回我们设定的默认值None。
    规则
    有了处理程序和文本块生成器,接下来就需要一定的规则来判断每个文本块交给处理程序将要加什么标记,rules.py代码如下:
    #!/usr/bin/python
    # encoding: utf-8
    # 设计一定的规则来判断每个文本块交给处理程序将要加什么标记

    class Rule:
        """Base class for rules that wrap a text block in handler events.

        Subclasses are expected to provide a ``type`` attribute (the event
        name passed to the handler) and a ``condition`` method.
        """

        def action(self, block, handler):
            """Emit the block through *handler*, wrapped in start/end events.

            Calls ``handler.start(self.type)``, ``handler.feed(block)`` and
            ``handler.end(self.type)`` in that order.

            Returns:
                True, which the caller uses as the signal that the block is
                fully handled.
            """
            handler.start(self.type)
            handler.feed(block)
            handler.end(self.type)
            return True

    class HeadingRule(Rule):
        """Heading rule: a short single-line block that does not end in ':'."""
        type = 'heading'

        def condition(self, block):
            """Return True when *block* qualifies as a heading.

            A heading contains no newline, is at most 70 characters long and
            does not end with a colon.
            """
            return '\n' not in block and len(block) <= 70 and block[-1] != ':'


    class TitleRule(HeadingRule):
        """Title rule: the first block tested, if it also looks like a heading."""
        type = 'title'
        first = True  # becomes False on the instance after the first check

        def condition(self, block):
            """Return True only for the first block tested, and only if it
            satisfies the heading condition."""
            if not self.first:
                return False
            self.first = False
            return HeadingRule.condition(self, block)


    class ListItemRule(Rule):
        """List item rule: a block whose first character is a hyphen."""
        type = 'listitem'

        def condition(self, block):
            """Return True when *block* starts with '-'."""
            return block[0] == '-'

        def action(self, block, handler):
            """Emit the block as a list item, without the leading hyphen and
            surrounding whitespace. Returns True (block fully handled)."""
            handler.start(self.type)
            handler.feed(block[1:].strip())
            handler.end(self.type)
            return True


    class ListRule(ListItemRule):
        """List rule: opens a list before the first item of a run of list
        items and closes it at the first following block that is not one."""
        type = 'list'
        inside = False  # True while a list is open

        def condition(self, block):
            """Always True: every block must be inspected to track list state."""
            return True

        def action(self, block, handler):
            """Emit list start/end events on transitions into or out of a list.

            Returns:
                False, so that the remaining rules still process the block.
            """
            if not self.inside and ListItemRule.condition(self, block):
                handler.start(self.type)
                self.inside = True
            elif self.inside and not ListItemRule.condition(self, block):
                handler.end(self.type)
                self.inside = False
            return False


    class ParagraphRule(Rule):
        """Paragraph rule: matches any block."""
        type = 'paragraph'

        def condition(self, block):
            """Always True."""
            return True


    # ---- Parsing ----
    # Once we know how each text block should be handled and which handler it
    # is passed to, the whole text can be parsed. markup.py is as follows:

    #!/usr/bin/python
    # encoding: utf-8
    # Program that parses the whole text

    import re
    import sys
    from handlers import *
    from util import *
    from rules import *


    class Parser:
        """Base parser: applies filters, then rules, to every text block."""

        def __init__(self, handler):
            """Store *handler* and start with no rules and no filters."""
            self.handler = handler
            self.rules = []
            self.filters = []

        def addRule(self, rule):
            """Append *rule*; rules are tried in the order they were added."""
            self.rules.append(rule)

        def addFilter(self, pattern, name):
            """Register a regex filter.

            Every match of *pattern* in a block is replaced by whatever the
            handler's ``sub(name)`` replacement function returns.
            """
            # Named to avoid shadowing the built-in ``filter``.
            def apply_filter(block, handler):
                return re.sub(pattern, handler.sub(name), block)
            self.filters.append(apply_filter)

        def parse(self, file):
            """Parse *file* block by block, emitting events on the handler.

            Each block is first passed through all filters, then offered to
            the rules in order. Processing of a block stops at the first rule
            whose action returns a true value.
            """
            self.handler.start('document')
            for block in blocks(file):
                for apply_filter in self.filters:
                    block = apply_filter(block, self.handler)
                for rule in self.rules:
                    if rule.condition(block):
                        last = rule.action(block, self.handler)
                        if last:
                            break
            self.handler.end('document')


    class BasicTextParser(Parser):
        """Plain-text parser preloaded with the standard rules and filters."""

        def __init__(self, handler):
            """Register the rules (order matters: first match that returns a
            true value wins) and the emphasis / URL / mail filters."""
            Parser.__init__(self, handler)
            self.addRule(ListRule())
            self.addRule(ListItemRule())
            self.addRule(TitleRule())
            self.addRule(HeadingRule())
            self.addRule(ParagraphRule())

            self.addFilter(r'\*(.+?)\*', 'emphasis')
            self.addFilter(r'(http://[\.a-zA-Z/]+)', 'url')
            self.addFilter(r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', 'mail')


    # Run the program: parse the text read from standard input.
    handler = HTMLRenderer()
    parser = BasicTextParser(handler)
    parser.parse(sys.stdin)