Hauptmenü öffnen
Babel
Diese Person spricht Deutsch als Muttersprache.
en-2
This user is able to contribute with an intermediate level of English.

Mein freier Python-Dump-Parser, realisiert als Zustandsautomat:

#!/bin/env python

import sys
import re
import codecs

def unicode_sequence(seq):
  return u"[" + u", ".join(u'"' + unicode(elem) + u'"' if isinstance(elem, basestring) else( u"_" + unicode(elem) + u"_" if not hasattr(elem, '__iter__') else unicode_sequence(elem)) for elem in seq) + u"]"

class respider:
  """Line-driven finite-state machine whose transitions are regex filters.

  A state is a dict with these keys (as read by ``start`` and ``step``):

  * ``"name"``   -- key under which the state is registered.
  * ``"delta"``  -- list of ``(filter_name, state_on_match,
    state_on_mismatch)`` tuples, tried in order. A target of ``None``
    means "stay in this state and try the next tuple".
  * ``"start"``  -- optional; its mere presence marks the initial state.
  * ``"action"`` -- optional collection that may contain ``"flush"``
    (drop everything collected so far), ``"uncache"`` (release everything
    collected so far as the step result) and ``"cut"`` (remove the input
    up to the end of a match before testing further filters).
  """

  def __init__(self):
    self.states = {}      # state name -> state dict
    self.current = None   # active state dict, None while not running
    self.cache = []       # matches collected in earlier steps, not yet released
    self.filters = {}     # filter name -> compiled regular expression
    self.filename = ""    # input file used by __iter__

  def add_filter(self, name, match):
    """Compile the regular expression ``match`` and register it as ``name``."""
    self.filters[name] = re.compile(match)

  def add_state(self, state):
    """Register ``state`` (a dict, see the class docstring) under its name."""
    self.states[state["name"]] = state

  def run(self, inputfile, outputfile):
    """Feed every line of ``inputfile`` through the machine.

    Each match tuple released by a step is written to ``outputfile`` as
    one line formatted by ``unicode_sequence``. Both files are UTF-8.
    """
    with codecs.open(inputfile, "r", encoding="utf-8") as fi:
      with codecs.open(outputfile, "w", encoding="utf-8") as fo:
        self.start()
        try:
          for line in fi:
            fo.writelines(unicode_sequence(outputline) + u"\n" for outputline in self.step(line))
        finally:
          # Leave the machine stopped even when reading or writing fails.
          self.stop()

  def __iter__(self):
    """Yield the non-empty step results for the lines of ``self.filename``."""
    with codecs.open(self.filename, "r", encoding="utf-8") as fi:
      self.start()
      try:
        for line in fi:
          result = self.step(line)
          if len(result) > 0:
            yield result
      finally:
        # Also runs when the consumer abandons the generator early.
        self.stop()

  def start(self):
    """Make the state that carries a ``"start"`` key the current state.

    If several states carry the key, the last one visited wins; if none
    does, the current state is left unchanged.
    """
    for name in self.states:
      if "start" in self.states[name]:
        self.current = self.states[name]

  def stop(self):
    """Deactivate the machine; ``step`` returns ``[]`` until ``start``."""
    # Reset instead of deleting the attribute so that a later step() or
    # start() does not fail with AttributeError.
    self.current = None

  def step(self, input):
    """Process one input line and return the matches released by it.

    The filters of the current state are tried in order, following
    transitions as described in the class docstring. Every match is
    recorded as ``(state_name, filter_name, match.groups())``. The
    return value is the list of recorded tuples released by an
    ``"uncache"`` action during this step (empty if none ran). Matches
    that were recorded but not dropped by ``"flush"`` are kept in
    ``self.cache`` for later steps.

    Note: a cycle of transitions whose filters keep matching the same
    input is not detected and does not terminate.
    """
    result = []
    result_step = []
    visited = []
    current_delta = 0
    if self.current is None:
      return result
    while True:
      state = self.current
      transitions = state["delta"]
      seen = state in visited
      if seen and current_delta >= len(transitions):
        break
      if not seen:
        visited.append(state)
      # Actions are applied on every pass through the state, not only on entry.
      actions = state.get("action", ())
      if "flush" in actions:
        self.cache = []
        result_step = []
      if "uncache" in actions:
        result += self.cache
        result += result_step
      if current_delta >= len(transitions):
        # State without transitions: its actions ran, nothing left to test.
        break
      transition = transitions[current_delta]
      m = self.filters[transition[0]].search(input)
      if m:
        result_step.append((state["name"], transition[0], m.groups()))
        if "cut" in actions:
          input = input[m.end():]
        target = transition[1]
      else:
        target = transition[2]
      if target is not None:
        self.current = self.states[target]
        current_delta = 0
      else:
        current_delta += 1
    self.cache += result_step
    return result

# The first command-line argument names the input file; without it the
# default dump file name below is used.
filename = sys.argv[1] if len(sys.argv) > 1 else "dewiktionary-latest-pages-articles.xml"

# The state machine that parses the dump; __iter__ reads from its filename.
my_spider = respider()
my_spider.filename = filename

# Filter expressions (regular expressions), registered under the names that
# the state transitions refer to.
for filter_name, filter_pattern in (
  ("any", ""),
  ("all", "(.*)"),
  ("page", "<page>"),
  ("endpage", "</page>"),
  ("title", "<title>([^:]{2,}?)</title>"),
  ("text", r"<text.*?>== ([\w\s]*?) \("),
  ("lang", r"\{\{Sprache\|Deutsch\}\}"),
  ("type", r"\{Wortart\|(.*?)\|Deutsch\}\}"),
  ("singular", r"SINGULAR=(.*)|Wer oder was.*?\(Einzahl(?: [1-9])?\)=(?:\s*)(?:der|die|das)?(?:\s*)(.*)"),
  ("plural", r"PLURAL=(.*)|Wer oder was.*?\(Mehrzahl(?: [1-9])?\)=(?:\s*)(?:die)?(?:\s*)(.*)"),
  ("phonetic", r":\[\[Hilfe:IPA\|IPA\]\]:(?:(.*?)\{\{Lautschrift\|(.+?)\}\})*"),
):
  my_spider.add_filter(filter_name, filter_pattern)

# States used by the finite automaton. They mainly look for a title and a
# text section and then extract the required data.
for state_definition in (
  {
    "name": "start",
    "delta": [("page", "getdata", None)],
    "start": True,
    "action": ["flush", "cut"],
  },
  {
    "name": "getdata",
    "delta": [
      ("page", "start", None),
      ("title", None, None),
      ("text", None, None),
      ("lang", "getdata2", None),
      ("singular", None, None),
      ("plural", None, None),
      ("phonetic", None, None),
      ("type", None, None),
    ],
  },
  {
    "name": "getdata2",
    "delta": [
      ("page", "end", None),
      ("endpage", "end", None),
      ("text", None, None),
      ("lang", None, None),
      ("singular", None, None),
      ("plural", None, None),
      ("phonetic", None, None),
      ("type", None, None),
    ],
  },
  {
    "name": "end",
    "delta": [("any", "start", None)],
    "action": ["uncache"],
  },
):
  my_spider.add_state(state_definition)

# Fields that are evaluated for every entry:
parse = ['title', 'singular', 'plural', 'type', 'phonetic']

# Fields that are required:

# a) for every part of speech ('phonetic' is currently not required)
essential_all = ['title', 'type']

# b) only for specific parts of speech
essential = {"Substantiv": ['singular', 'plural']}

# A part of speech with specific requirements must be listed here:
known_types = ["Substantiv"]

# Counts of the parts of speech encountered that are not in known_types.
unknown_types = {}

# Prepend the generally required fields to every type-specific list.
for specific_fields in essential.values():
  specific_fields[:0] = essential_all

# Output goes to the files given as the second and third command-line
# arguments: complete entries to the former, problem reports to the latter.
if len(sys.argv) < 4:
  sys.exit("usage: %s INPUT_FILE OUTPUT_FILE ERROR_FILE" % sys.argv[0])

complete = 0  # entries for which every required field was found
count = 0     # all results produced by the spider

with codecs.open(sys.argv[2], "w", encoding="utf-8") as fo:
  with codecs.open(sys.argv[3], "w", encoding="utf-8") as ferr:
    for result in my_spider:
      count += 1
      # Group the match tuples (regex groups) of this result by filter name.
      record = dict((item, [data[2] for data in result if data[1] == item]) for item in parse)
      if not record["type"]:
        # Results without a part of speech are counted but not reported.
        continue
      word_type = record["type"][0][0]
      # The title filter may not have matched; use a placeholder so the
      # reports below can still be written instead of raising IndexError.
      title = record["title"][0][0] if record["title"] else u"<no title>"
      is_known_type = word_type in known_types
      if u"{" in word_type or u"}" in word_type:
        # Braces in the captured type indicate that the type filter
        # captured more than a plain part-of-speech name.
        ferr.write(u"TypeError: " + title + u" - " + word_type + u"\n")
      elif not is_known_type:
        unknown_types[word_type] = unknown_types.get(word_type, 0) + 1
      required = essential[word_type] if is_known_type else essential_all
      # A field counts as present if at least one of its matches has a
      # group that is not None.
      if all(record[item] and any(any(grouppart is not None for grouppart in match) for match in record[item]) for item in required):
        fo.write(title + u": " + unicode_sequence(record.items()) + u"\n")
        complete += 1
      else:
        # Report every parsed field that has no match with a non-None group.
        empty_fields = [(item, data) for (item, data) in record.items() if all(all(grouppart is None for grouppart in match) for match in data)]
        ferr.write(u"Missing: " + title + u" - " + unicode_sequence(empty_fields) + u"\n")

# Single-argument print calls produce the same text on Python 2 and Python 3.
print("%s / %s complete" % (complete, count))
print(unknown_types)