# This module contains functions for resolving abbreviations according to the rules presented in:
# http://wiki-de.genealogy.net/Kartei_Leipziger_Familien

import collections
import re
from functools import lru_cache

# Ordered substitution rules. They are applied one after another with
# ``str.replace``, so the order matters: "- u." has to be handled before the
# bare "-" rule and before the "u. " rule.
_ABBREVIATIONS = collections.OrderedDict([
    # extension of the specifications of the KLF; problem: e.g. red and tan
    # tanners, "$" to recognise that it is not a separate profession
    ("- u.", "$"),
    ("B.", "&Bürger"),  # "&" to be able to distinguish legal status from profession
    ("Bg.", "Bürger"),  # extension of the specifications of the KLF
    ("Bgmstr.", "Bürgermeister"),  # extension of the specifications of the KLF
    ("Br.", "Brauer"),
    ("Brbr.", "Branntweinbrenner"),
    # NOTE(review): the original comment here mentioned the "&" legal-status
    # marker, but the value carries no "&" -- confirm which one is intended.
    ("E.", "Einwohner"),
    ("Fl.", "Fleischer"),
    ("Gl.", "Glaser"),
    ("h.", "händler"),  # ending of word
    ("Kr.", "Kramer"),
    ("L.", "Leinenweber"),
    ("m.", "macher"),  # ending of word
    ("Z.", "Zimmermann"),
    ("Zg.", "Zimmergeselle"),
    # "u." stands for "und" (and) and has to be deleted,
    # extension of the specifications of the KLF
    ("u. ", ""),
    # extension of the specifications of the KLF; occupations are partly
    # separated with a hyphen and not with a "u."
    ("-", " "),
    # extension of the specifications of the KLF; if there are more than two
    # professions, there is a comma as a separator
    (",", ""),
    # extension of the specifications of the KLF; place indication is not a
    # profession, "@" to recognise place indication
    ("zu ", "@"),
])

# Expansions whose resolved text itself contains a hyphen. They are applied
# only after the separator rules above; otherwise the "-" rule would rewrite
# the hyphen inside the expansion ("Gold- und ..." became "Gold  und ...").
_HYPHENATED_EXPANSIONS = (
    ("GuS.", "Gold- und Silberdrahtzieher"),
)

# patterns
# not all the rules are implemented yet, mainly the ones found in the sample
_BRACKET_PATTERN = re.compile(r"[(].*[)]")
_OCCU_PATTERN = re.compile(r"[A-Za-z]+[.]")


@lru_cache(maxsize=None)
def resolveroccu(occu):
    """
    Resolve the abbreviations contained in an occupational designation.

    If the designation contains at least one abbreviation candidate (ASCII
    letters directly followed by a dot), every rule of the abbreviation table
    is applied in order via plain substring replacement. Afterwards a pair of
    round brackets is stripped if an opening bracket is followed by a closing
    one.

    Results are memoised with ``lru_cache``, so ``occu`` must be hashable.

    :param occu: the occupational designation as found in the source
    :return: the designation with all known abbreviations resolved
    """
    # pattern.search is used to check if a pattern matches since pattern.match
    # only checks if the string is beginning with the pattern
    # NOTE(review): the separator rules ("-", ",", "zu ") therefore only run
    # when an abbreviation is present; kept as in the original implementation.
    if _OCCU_PATTERN.search(occu):
        for abbreviation, resolve in _ABBREVIATIONS.items():
            occu = occu.replace(abbreviation, resolve)
        for abbreviation, resolve in _HYPHENATED_EXPANSIONS:
            occu = occu.replace(abbreviation, resolve)
    # get rid of brackets
    if _BRACKET_PATTERN.search(occu):
        occu = occu.replace("(", "").replace(")", "")
    return occu