Neue Datei hochladen

2dbb899a · Marcus Baumgarten · 715e1270 · 2dbb899a
Commit 2dbb899a authored 2 years ago by Marcus Baumgarten
--- a/occuresolver.py
+++ b/occuresolver.py
+# This module contains functions for resolving abbreviations according to the rules presented in:
+# http://wiki-de.genealogy.net/Kartei_Leipziger_Familien
+import collections
+import re
+from functools import lru_cache
@lru_cache(maxsize=None)
def resolveroccu(occu):
    """
    Resolve the abbreviations contained in an occupational designation.

    The rules follow the conventions of the "Kartei Leipziger Familien" (KLF)
    plus a few extensions. They are applied one after another, in table order,
    as plain substring replacements over the whole string, so every rule also
    sees the output of the rules before it. The rules are only applied when the
    input contains at least one ASCII-letter abbreviation (letters followed by
    a full stop); otherwise only the bracket handling below takes place.

    Marker characters written into the result:
      * "$" - replaces "- u."; signals that the joined parts are not separate
        professions (e.g. red and tan tanners)
      * "&" - prefixes a legal status so it can be told apart from a profession
      * "@" - replaces "zu "; signals a place indication, not a profession

    :param occu: the occupational designation, possibly containing abbreviations
    :return: the designation with all known abbreviations resolved and, if it
             contained a "(...)" pair, with all round brackets removed
    """
    # Ordered replacement rules (abbreviation, resolution). The order matters
    # because each rule operates on the result of the previous ones.
    # Not all KLF rules are implemented yet, mainly the ones found in the sample.
    rules = (
        # extension of the KLF rules; "$" marks that this is not a separate profession
        ("- u.", "$"),
        # "&" to be able to distinguish legal status from profession
        ("B.", "&Bürger"),
        # extension of the KLF rules
        ("Bg.", "Bürger"),
        # extension of the KLF rules
        ("Bgmstr.", "Bürgermeister"),
        ("Br.", "Brauer"),
        ("Brbr.", "Branntweinbrenner"),
        # NOTE(review): the original comment on this entry mentions the "&"
        # legal-status marker, but the value carries none - confirm whether
        # "&Einwohner" was intended. Behaviour is kept as it was.
        ("E.", "Einwohner"),
        ("Fl.", "Fleischer"),
        ("Gl.", "Glaser"),
        # ending of a word
        ("h.", "händler"),
        ("Kr.", "Kramer"),
        ("L.", "Leinenweber"),
        # ending of a word
        ("m.", "macher"),
        ("Z.", "Zimmermann"),
        ("Zg.", "Zimmergeselle"),
        # "u." stands for "und" (and) and has to be deleted; extension of the KLF rules
        ("u. ", ""),
        # extension of the KLF rules; occupations are partly separated with a
        # hyphen and not with a "u."
        ("-", " "),
        # Must come AFTER the "-" rule: its resolution contains a hyphen that
        # belongs to the word and would otherwise be turned into a blank
        # ("Gold  und Silberdrahtzieher").
        ("GuS.", "Gold- und Silberdrahtzieher"),
        # extension of the KLF rules; with more than two professions a comma
        # is used as separator
        (",", ""),
        # extension of the KLF rules; "@" marks a place indication
        ("zu ", "@"),
    )

    # search() instead of match(): the abbreviation may appear anywhere in the
    # string, not only at its beginning.
    if re.search(r"[A-Za-z]+[.]", occu):
        for abbreviation, resolution in rules:
            occu = occu.replace(abbreviation, resolution)

    # Get rid of brackets, but only when an opening bracket is followed by a
    # closing one; a lone bracket is left untouched.
    if re.search(r"[(].*[)]", occu):
        occu = occu.replace("(", "").replace(")", "")

    return occu