# My free Python dump parser, implemented with a finite-state machine:
#!/usr/bin/env python
import sys
import re
import codecs
def unicode_sequence(seq):
    """Render *seq* as a unicode string like [..].

    Strings are quoted ("x"), scalars without __iter__ are wrapped in
    underscores (_1_), and nested iterables are rendered recursively.
    """
    # Compatibility shim: `unicode`/`basestring` only exist on Python 2.
    try:
        text = unicode
        stringlike = basestring
    except NameError:  # Python 3
        text = str
        stringlike = str
    parts = []
    for elem in seq:
        if isinstance(elem, stringlike):
            parts.append(u'"' + text(elem) + u'"')
        elif not hasattr(elem, '__iter__'):
            parts.append(u"_" + text(elem) + u"_")
        else:
            # Nested iterable: render it with the same bracket notation.
            parts.append(unicode_sequence(elem))
    return u"[" + u", ".join(parts) + u"]"
class respider:
    """Line-oriented finite-state parser driven by named regex filters.

    States are plain dicts with a "name", a "delta" transition list of
    (filter name, target on match, target on mismatch) tuples, an optional
    "action" list ("flush", "uncache", "cut") and an optional "start" key
    marking the initial state.
    """

    def __init__(self):
        self.states = {}     # state name -> state dict
        self.current = None  # active state dict, or None when stopped
        self.cache = []      # matches collected since the last "flush"
        self.filters = {}    # filter name -> compiled regex
        self.filename = ""   # input file used by __iter__

    def add_filter(self, name, match):
        """Register the regular expression *match* under *name*."""
        self.filters[name] = re.compile(match)

    def add_state(self, state):
        """Register a state dict (must contain "name" and "delta")."""
        self.states[state["name"]] = state

    def run(self, inputfile, outputfile):
        """Feed every line of *inputfile* through the automaton and write
        each released result as one unicode_sequence line to *outputfile*."""
        fi = codecs.open(inputfile, "r", encoding="utf-8")
        try:
            fo = codecs.open(outputfile, "w", encoding="utf-8")
            try:
                self.start()
                for line in fi:
                    fo.writelines(unicode_sequence(outputline) + u"\n"
                                  for outputline in self.step(line))
                self.stop()
            finally:
                # Close even if a filter or write raises (was leaked before).
                fo.close()
        finally:
            fi.close()

    def __iter__(self):
        """Yield the non-empty result of step() for every line of
        self.filename."""
        fi = codecs.open(self.filename, "r", encoding="utf-8")
        try:
            self.start()
            for line in fi:
                result = self.step(line)
                if len(result) > 0:
                    yield result
            self.stop()
        finally:
            fi.close()

    def start(self):
        """Make the state carrying a "start" key the current state."""
        for name in self.states:
            # `in` replaces the Python-2-only dict.has_key().
            if "start" in self.states[name]:
                self.current = self.states[name]

    def stop(self):
        # Bug fix: reset instead of `del self.current` -- deleting the
        # attribute made any later start()/step() raise AttributeError.
        self.current = None

    def step(self, line):
        """Run the automaton over one input *line*.

        Returns the list of (state name, filter name, match groups) tuples
        released by states with an "uncache" action; everything matched in
        this step is appended to self.cache for a later "uncache".
        """
        result = []
        result_step = []
        visited = []
        current_delta = 0
        if self.current is not None:
            # Follow transitions until the delta list of the current state is
            # exhausted and the state has already been visited (loop guard).
            while current_delta < len(self.current["delta"]) or self.current not in visited:
                if self.current not in visited:
                    visited.append(self.current)
                    # Entry actions fire once per state visit.
                    if "action" in self.current:
                        if "flush" in self.current["action"]:
                            self.cache = []
                            result_step = []
                        if "uncache" in self.current["action"]:
                            # Release everything cached so far plus this step.
                            result += self.cache
                            result += result_step
                m = self.filters[self.current["delta"][current_delta][0]].search(line)
                if m:
                    result_step.append((self.current["name"],
                                        self.current["delta"][current_delta][0],
                                        m.groups()))
                    if "action" in self.current and "cut" in self.current["action"]:
                        # Consume the matched prefix so later filters cannot
                        # re-match it on the same line.
                        line = line[m.end():]
                    if self.current["delta"][current_delta][1] is not None:
                        self.current = self.states[self.current["delta"][current_delta][1]]
                        current_delta = 0
                    else:
                        current_delta += 1
                else:
                    if self.current["delta"][current_delta][2] is not None:
                        self.current = self.states[self.current["delta"][current_delta][2]]
                        current_delta = 0
                    else:
                        current_delta += 1
        self.cache += result_step
        return result
#The first command line argument names the input dump file; fall back to the
#German Wiktionary dump in the working directory.
filename = sys.argv[1] if len(sys.argv) > 1 else "dewiktionary-latest-pages-articles.xml"
my_spider = respider()
my_spider.filename = filename
#Regular expressions (regex filters) the automaton matches on every line:
for filter_name, filter_pattern in [
    ("any", ""),
    ("all", "(.*)"),
    ("page", "<page>"),
    ("endpage", "</page>"),
    ("title", "<title>([^:]{2,}?)</title>"),
    ("text", r"<text.*?>== ([\w\s]*?) \("),
    ("lang", r"\{\{Sprache\|Deutsch\}\}"),
    ("type", r"\{Wortart\|(.*?)\|Deutsch\}\}"),
    ("singular", r"SINGULAR=(.*)|Wer oder was.*?\(Einzahl(?: [1-9])?\)=(?:\s*)(?:der|die|das)?(?:\s*)(.*)"),
    ("plural", r"PLURAL=(.*)|Wer oder was.*?\(Mehrzahl(?: [1-9])?\)=(?:\s*)(?:die)?(?:\s*)(.*)"),
    ("phonetic", r":\[\[Hilfe:IPA\|IPA\]\]:(?:(.*?)\{\{Lautschrift\|(.+?)\}\})*"),
]:
    my_spider.add_filter(filter_name, filter_pattern)
#States of the finite automaton. It mainly looks for a title and a German
#text section of a page and then extracts the required pieces of data.
my_spider.add_state(dict(name="start", delta=[("page", "getdata", None)],
                         start=True, action=["flush", "cut"]))
my_spider.add_state(dict(name="getdata", delta=[("page", "start", None),
                                                ("title", None, None),
                                                ("text", None, None),
                                                ("lang", "getdata2", None),
                                                ("singular", None, None),
                                                ("plural", None, None),
                                                ("phonetic", None, None),
                                                ("type", None, None)]))
my_spider.add_state(dict(name="getdata2", delta=[("page", "end", None),
                                                 ("endpage", "end", None),
                                                 ("text", None, None),
                                                 ("lang", None, None),
                                                 ("singular", None, None),
                                                 ("plural", None, None),
                                                 ("phonetic", None, None),
                                                 ("type", None, None)]))
my_spider.add_state(dict(name="end", delta=[("any", "start", None)], action=["uncache"]))
#Which captured items are evaluated per page:
parse = ['title', 'singular', 'plural', 'type', 'phonetic']
#Items required for every word type:
essential_all = ['title', 'type']#, 'phonetic']
#Items additionally required for specific word types only:
essential = {"Substantiv": ['singular', 'plural']}
#A word type with specific expectations must also be listed here:
known_types = ["Substantiv"]
unknown_types = {}
#Prepend the always-required items to every type-specific requirement list.
for word_type in essential:
    essential[word_type][:0] = essential_all
#Output goes to the files named by the second and third command line
#parameters: complete entries to argv[2], problem reports to argv[3].
fo = codecs.open(sys.argv[2], "w", encoding="utf-8")
ferr = codecs.open(sys.argv[3], "w", encoding="utf-8")
complete = 0
count = 0
try:
    for result in my_spider:
        count += 1
        # Group the captured tuples of this page by filter name.
        # (Renamed from `buffer`, which shadowed the builtin.)
        found = dict((item, [data[2] for data in result if data[1] == item])
                     for item in parse)
        if len(found["type"]) > 0:
            word_type = found["type"][0][0]
            is_known_type = word_type in known_types
            if u"{" in word_type or u"}" in word_type:
                # The word-type capture ran into another template -- report it
                # instead of writing junk to the result file.
                ferr.write(u"TypeError: " + found['title'][0][0] + u" - " + word_type + u"\n")
            elif not is_known_type:
                # Tally word types we have no specific expectations for.
                unknown_types[word_type] = unknown_types.get(word_type, 0) + 1
            # A page is complete when every required item produced at least
            # one match with a non-None capture group.
            required = essential[word_type] if is_known_type else essential_all
            if all(len(found[item]) > 0 and
                   any(any(group is not None for group in match) for match in found[item])
                   for item in required):
                fo.write(found['title'][0][0] + u": " + unicode_sequence(found.items()) + u"\n")
                complete += 1
            else:
                # Report which items never produced a usable capture.
                ferr.write(u"Missing: " + found['title'][0][0] + u" - " +
                           unicode_sequence((item, data) for (item, data) in found.items()
                                            if all(all(group is None for group in match)
                                                   for match in data)) +
                           u"\n")
finally:
    # Close both outputs even if parsing raises (was leaked before).
    fo.close()
    ferr.close()
# sys.stdout.write replaces the Python-2-only print statement; the output
# text is identical ("N / M complete" and the unknown-type tally).
sys.stdout.write(u"%d / %d complete\n" % (complete, count))
sys.stdout.write(u"%s\n" % (unknown_types,))