diff --git a/2022_005_goldberg/Skripte/Hauptprogramm/main.py b/2022_005_goldberg/Skripte/Hauptprogramm/main.py
deleted file mode 100644
index 349287003e173101cd427d49d7188dccd8f58685..0000000000000000000000000000000000000000
--- a/2022_005_goldberg/Skripte/Hauptprogramm/main.py
+++ /dev/null
@@ -1,1272 +0,0 @@
-import csv
-import os.path
-from multiprocessing import Pool, current_process
-from functools import partial
-import time
-import json
-import Levenshtein
-import copy
-
-
-def loadData(filename, delimiter, encoding):
-    """
-    This function is used to open files in which data is temporarily stored and was created by the program in a previous run.
-    :param filename: designation of the file
-    :param delimiter: type of delimiter as string
-    :return: list of dictionaries with information of the file to be loaded
-    """
-    content = []  # list oft dicts
-    try:
-        with open(filename, "r", encoding=encoding) as data:  # , errors='ignore'
-            for i in csv.DictReader(data, delimiter=delimiter):
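-                # JSON round trip turns the rows (dictionary-like objects) into plain dictionaries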
-                i = json.loads(json.dumps(i))
-                content.append(i)
-    except FileNotFoundError:
-        print("Status: Inital pass for file", filename, "(no list created yet).")
-    return (content)
-
-
-def appendFile(filename, data, fieldnames):
-    """
-    This function describes CSV files.
-    :param filename: designation of the file (string)
-    :param data: type of delimiter (string)
-    :param fieldnames: column names of the file to be written (list of strings)
-    """
-    opener = open(filename, "a", newline='', encoding="utf-8")
-    writer = csv.DictWriter(opener, fieldnames=fieldnames, delimiter="\t")
-    # differentiation of cases where one or more lines are to be added
-    # for the file "qualityofgedcom.csv" only one line should be written at a time
-    # for all other files several lines should be written
-    if filename == "qualityofgedcom.csv":
-        writer.writerow(data)
-    else:
-        writer.writerows(data)
-    opener.close()
-
-
-def createFile(filename, fieldnames, delimiter, encoding):
-    """
-    This function creates a new file if no file already exists under this name.
-    The function is also used to load data when it is clear that the file already exists.
-    :param filename: designation of the file (string)
-    :param fieldnames: column names of the file to be written (list of strings)
-    :param delimiter: type of delimiter (string)
-    :param encoding: encoding of the file (string)
-    :return: list of dictionaries with information of the file to be loaded
-    """
-    content = loadData(filename, delimiter, encoding)
-    # create a new file if it is not there
-    if len(content) == 0:  # check if the variable does not contain any data
-        opener = open(filename, "w", newline='', encoding="utf-8-sig")
-        writer = csv.writer(opener, delimiter=delimiter)
-        writer.writerow(fieldnames)
-        opener.close()
-    return (content)
-
-
-def loadGedcomFile(filename):
-    """
-    This function loads the data of a GEDCOM file and writes them line by line into a list.
-    :param filename: name of the file
-    :return: in case of error "NONE", otherwise a list with the information of the GEDCOM file
-    """
-    # define file path
-    filepath = os.path.join("data", filename)
-    preparedData = []
-    try:
-        gedcom = open(filepath, "r", encoding="utf-8")
-        data = gedcom.readline()
-        # the header line is read first but not stored
-        data = data[:-1]  # strip the trailing newline character
-        while data != "":  # last line is empty
-            data = str(gedcom.readline())
-            data = data[:-1]  # strip the trailing newline character
-            preparedData.append(data)
-        gedcom.close()
-        return (preparedData)
-    except FileNotFoundError:
-        print("Error: There is a problem with access to the file", filename, ".")
-        return ("NONE")
-
-
-def separator(occu, replaced, replacer):
-    """
-    This function is used to replace separation operators.
-    :param occu: string that is processed
-    :param replaced: content to be replaced
-    :param replacer: replacement for the content to be replaced
-    :return: new string with changed content
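-    example: separator("Bauer, Schmied", ", ", " und ") returns "Bauer und Schmied"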
-    """
-    if replaced in occu:
-        occu = occu.replace(replaced, replacer)
-    return (occu)
-
-
-def endOfString(phrase, signalWord):
-    """
-    This function detects the position of an element within a string.
-    If the signal word occurs, its end position is determined and everything before it is removed.
-    :param phrase: string to be searched (string)
-    :param signalWord: marker that indicates a subsequent element, e.g. a place name (string)
-    :return: text after the end position of the signal word in the phrase
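-    example: endOfString("Bauer in Berlin", " in ") returns "Berlin"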
-    """
-    # if the phrase contains the signal word, find its end position and keep only the text after it
-    if signalWord in phrase:
-        endOfString = phrase[(phrase.find(signalWord) + len(signalWord)):]
-        return (endOfString)
-    return ("")
-
-
-def replaceLoc(signalWord, phrase, loc):
-    """
-    This function removes a signal word and the associated location name from a phrase.
-    :param signalWord: marker that indicates a location (string)
-    :param phrase: string to be searched (string)
-    :param loc: designation of a place (string)
-    :return: adjusted occupation phrase
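-    example: replaceLoc(" in ", "Bauer in Berlin", "Berlin") returns "Bauer"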
-    """
-    if signalWord in phrase:
-        phrase = phrase.replace(signalWord, "")  # remove "signalWord"
-        phrase = phrase.replace(loc, "")  # remove location
-    return (phrase)
-
-
-def dictSearch(relevantDict, key, relevantObject):
-    """
-    This function searches a given list of dictionaries for a searched value and specifies the key.
-    :param relevantDict: list of dictionaries that will be searched
-    :param key: key of the dictionary to be studied
-    :param relevantObject: name of the value to be searched for under the key in the Dictionary
-    :return: index of the matching dictionary in the list (-1 if none is found)
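-    example: dictSearch([{"variant": "Bauer"}], "variant", "Bauer") returns 0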
-    """
-    # search per list comprehension
-    # note: upper and lower case is relevant here
-    occuIndex = next((index for (index, d) in enumerate(relevantDict) if d[key] == relevantObject), None)
-    if occuIndex is None:
-        return (-1)  # if it could not be found
-    return (occuIndex)
-
-
-def partCorrector(phrase, existingVariantsKldB):
-    """
-    This function cleans up a location specification.
-    Information that is not related to the location will be filtered out.
-    In addition, an attempt is made to find a lemma for this occupation.
-    :param phrase: occupation (string)
-    :param existingVariantsKldB: already classified occupation KldB (list of dictionaries)
-    :return: information about the occupation (dictionary)
-    """
-    # initialization of variables so that they always exist
-    titel = ""
-    role = ""
-    year = ""
-    url = ""
-    brackets = ""
-
-    # step 5: geographic prepositions ("loc" stands for location)
-    # find and save place names
-    # if several place names occur, only the last one found is kept
-    # there are signal words that indicate a subsequent location (e.g. "in", "im")
-    locationSignalWords = [" in ", " im ", " In ", " i. ", " von ", " v. ", " zu ", " auf ",
-                           " aus ", " Aus ", " an ", " der ", " des ", " van "]
-    # besides location information there are signal words for employers
-    # "loc" continues to be used for these, even though the literal sense no longer fits
-    employerSignalWords = [" bei ", " bei dem ", " beim ", " bei der "]
-    loc = ""
-    for signalWord in locationSignalWords + employerSignalWords:
-        foundLoc = endOfString(phrase, signalWord)
-        # only overwrite "loc" if the signal word actually occurs,
-        # otherwise a location found earlier would be reset to an empty string
-        if foundLoc != "":
-            loc = foundLoc
-            phrase = replaceLoc(signalWord, phrase, foundLoc)
-
-    # in addition, there are signal words preceding an occupation that indicate the affiliation to a dominion
-    affiliation = ["herrschaftlich", "herrschaftliche", "herrschaftlicher", "königlich", "königliche", "königlicher",
-                   "fürstlich", "fürstliche", "fürstlicher"]
-    for i in affiliation:
-        if i in phrase:
-            # this information should not be deleted from the occupation statement
-            # it should only be stored in "loc" to be output separately afterwards
-            # if "loc" is empty, then no comma should precede it
-            if loc != "":
-                loc = loc + ", " + i
-            else:
-                loc = i
-
-    # find and save years
-    # more detailed dates are reduced to year information
-    # assumption: years always have four digits and are at the beginning
-    # check whether the first four characters are digits
-    # (the length check prevents shorter digit groups from being mistaken for a year)
-    if len(phrase) >= 4 and phrase[:4].isdigit():
-        # separate the year from the part behind it
-        year = phrase[:4]
-        phrase = phrase[4:]
-
-    # store content in brackets separately
-    if "(" in phrase and ")" in phrase:
-        brackets = phrase[phrase.find("(") + 1:phrase.find(")")]  # content without the brackets themselves
-        phrase = phrase[:phrase.find("(")] + phrase[phrase.find(")") + 2:]  # +2 skips the bracket and the following space
-    if "[" in phrase and "]" in phrase:
-        brackets = phrase[phrase.find("[") + 1:phrase.find("]")]  # content without the brackets themselves
-        phrase = phrase[:phrase.find("[")] + phrase[phrase.find("]") + 2:]  # +2 skips the bracket and the following space
-
-    # find and save URLs
-    # example: <a href="https://de.wikipedia.org/wiki/Geschichte_des_Kantons_Thurgau#Grafen_im_Thurgau">Graf im Thurgau</a>
-    if "<a" in phrase and "</a>" in phrase:
-        url = phrase[phrase.find("<a"):phrase.find("</a>") + len("</a>")]
-        phrase = phrase[:phrase.find("<a")] + phrase[phrase.find("</a>") + len("</a>"):]
-
-    # find and save role
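-    # example: "Ehefrau d. Bauern" yields role = "Frau"; the remaining phrase " Bauern" is cleaned up later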
-    # wife
-    if "F. d." in phrase:
-        role = "Frau"
-        phrase = endOfString(phrase, "F. d.")
-    if "Ehefrau des" in phrase:
-        role = "Frau"
-        phrase = endOfString(phrase, "Ehefrau des")
-    if "Ehefrau d." in phrase:
-        role = "Frau"
-        phrase = endOfString(phrase, "Ehefrau d.")
-    if "Ehefrau" in phrase:
-        role = "Frau"
-        phrase = endOfString(phrase, "Ehefrau")
-    if "frau" in phrase and "Haus" != phrase[:4] and "Acker" != phrase[:5]:
-        role = "Frau"
-        phrase = phrase.replace("sfrau", "")
-        phrase = phrase.replace("frau", "")
-    # daughter
-    if "T. d." in phrase:
-        role = "Tochter"
-        phrase = endOfString(phrase, "T. d.")
-    if "tochter" in phrase:
-        role = "Tochter"
-        phrase = phrase.replace("stochter", "")
-        phrase = phrase.replace("tochter", "")
-    # son
-    if "S. d." in phrase:
-        role = "Sohn"
-        phrase = endOfString(phrase, "S. d.")
-    if "sohn" in phrase:
-        role = "Sohn"
-        phrase = phrase.replace("ssohn", "")
-        phrase = phrase.replace("sohn", "")
-
-    # find and save titles
-    if "Prof." in phrase:
-        titel = "Professor"
-        phrase = endOfString(phrase, "Prof.")
-    if "Professor" in phrase:
-        titel = "Professor"
-        phrase = endOfString(phrase, "Professor")
-
-    # step 9: temporal prepositions and numerals
-    if " am " in phrase:
-        year = endOfString(phrase, " am ")
-        phrase = phrase.replace(" am ", "")
-        phrase = phrase.replace(year, "")
-    if " bis " in phrase:
-        year = endOfString(phrase, " bis ")
-        phrase = phrase.replace(" bis ", "")
-        phrase = phrase.replace(year, "")
-
-    # delete numbers, unless they end with a dot (ordinals) or form a group of four consecutive digits, which is taken as a year
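-    # example: in "Bauer 1870" the digit group "1870" is stored as year and removed from the phrase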
-    numbers = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
-    numberLength = 0
-    prePart = phrase
-    for i in range(len(phrase)):
-        if prePart[i:i + 1] in numbers:
-            numberLength = numberLength + 1
-            if prePart[i + 1:i + 2] != "." and prePart[i + 1:i + 2] not in numbers:
-                if numberLength == 4:
-                    year = prePart[i - 3:i + 1]
-                    phrase = phrase.replace(year, "")
-                    numberLength = 0
-                else:
-                    phrase = phrase.replace(phrase[i - numberLength + 1:i + 1], "")
-                    numberLength = 0
-            elif phrase[i + 1:i + 2] == ".":
-                numberLength = 0
-
-    # remove remaining special characters
-    phrase = phrase.replace(":", "")
-
-    # remove blanks here again
-    # "cleanedOccupation" is what remains of the occupation specification
-    cleanedOccupation = phrase.strip()
-
-    # search whether there is a corresponding counterpart in the already classified occupational data
-    occuIndex = dictSearch(existingVariantsKldB, "variant", cleanedOccupation)
-    # if occuIndex is not "-1", then a counterpart was found
-    if occuIndex != -1:
-        # KldB identifier
-        kldb = existingVariantsKldB[occuIndex]["code"]
-        # way of selection of a counterpart
-        info = "found direct"
-        # "levDict" stands for Levenshtein dictionary
-        # the name is not quite appropriate here, because no Levenshtein distance is used
-        # it is kept anyway for uniformity of the variable
-        levDict = {"lemma row": occuIndex,  # line of the matching dictionary
-                   "variant": "",
-                   "best fit lemma": existingVariantsKldB[occuIndex]["variant"],
-                   # designation of the appropriate occupation
-                   "absolute distance": "",
-                   "relative distance": "",
-                   "selection": ""
-                   }
-    # if occuIndex is "-1", no counterpart was found and a similarity analysis starts
-    elif occuIndex == -1 and cleanedOccupation != "":  # cleanedOccupation must not be empty
-        # similarity analysis
-        levDict = levenshteinDist(existingVariantsKldB, "variant", cleanedOccupation, "code")
-        # setting the relative Levenshtein distance of 0.25 as the essential threshold for selection
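-        # example: one edit in a five-letter single-word occupation gives 1 / 5 = 0.2 and is accepted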
-        if levDict["relative distance"] < 0.25:
-            levDict.update({"selection": 1})
-            kldb = existingVariantsKldB[levDict["lemma row"]]["code"]  # take the line here from the levDict
-            # way of selection of a counterpart
-            info = "found after levenshtein"
-        else:
-            # no counterpart found
-            levDict.update({"selection": 0})
-            kldb = ""
-            info = "not found"
-    # no occupation remains
-    else:
-        kldb = ""
-        info = "no occupational designation"
-        levDict = {"lemma row": "", "variant": "", "best fit lemma": "", "absolute distance": "",
-                   "relative distance": "", "selection": ""}
-
-    # store the information sorted for each phrase (occupation)
-    occupationResult = {
-        "occupation": cleanedOccupation,
-        "best fit lemma": levDict["best fit lemma"],
-        "row of best fit lemma": levDict["lemma row"],
-        "KldB 2010": kldb,
-        "titel": titel,
-        "role": role,
-        "location": loc,
-        "year": year,
-        "url": url,
-        "further info": brackets,
-        "selection info": info,
-        "similarity analysis": levDict,
-        "lemma row": levDict["lemma row"],
-        "absolute distance": levDict["absolute distance"],
-        "relative distance": levDict["relative distance"]
-    }
-    return (occupationResult)
-
-
-def abbreviationsCorrector(firstString, secondString):
-    """
-    This function compares two phrases and checks if one of them could be an abbreviation of the other.
-    If "s"econdString" is an abbreviation of "firstString", "firstString" will be returned truncated.
-    :param firstString: first phrase without abbreviation (string)
-    :param secondString: second phrase with abbreviation (string)
-    :return: resolved abbreviation of "firstString" (string)
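-    example: abbreviationsCorrector("Handelsmann", "Handelsm.") returns "Handelsm. " (note the trailing space)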
-    """
-    # continue only if there is a dot in "secondString"
-    # first letters must be equal (serves to improve runtime)
-    if "." in secondString and secondString[:1] == firstString[:1]:
-        positionDot = secondString.find(".")
-        # find the abbreviated part in the other string and delete it in the original name
-        # count backwards to find blanks
-        for position in range(positionDot, 0, -1):
-            # a blank marks the beginning of the abbreviated word
-            if secondString[position:position + 1] == " ":
-                beforeDot = secondString[position:positionDot]
-                break
-            elif position == 1:
-                beforeDot = secondString[:positionDot]
-
-        # testing the minimum length
-        try:
-            # the part before the dot must be at least 3 letters long
-            if positionDot - position < 4:
-                # if fewer than three letters, return the original value
-                return (firstString)
-        except UnboundLocalError:
-            # "position" is unbound if the dot is the very first character
-            position = 0
-            beforeDot = secondString[position:positionDot]
-            # the part before the dot must be at least 3 letters long
-            if positionDot - position < 4:
-                # if fewer than three letters, return the original value
-                return (firstString)
-
-        if beforeDot in firstString:
-            positionPart = firstString.find(beforeDot) + len(beforeDot)
-            for position in range(positionPart, len(firstString) + 1):
-                # blank, hyphen or general end; +1 is allowed here, the slice is then simply empty
-                if (firstString[position:position + 1] == " "
-                        or firstString[position:position + 1] == "-"
-                        or position == len(firstString)):
-                    positionEnd = position
-                    break
-            # abbreviation found, abbreviate original name
-            firstString = firstString[:positionPart] + ". " + firstString[positionEnd:]
-    return (firstString)
-
-
-def levenshteinDist(existingVariantsKldB, key, relevantObject, keyRelevantDict):
-    """
-    This function generates the Levenshtein distance between two strings.
-    :param existingVariantsKldB: already classified occupation KldB (list of dictionaries)
-    :param key: designation of the key in "existingVariantsKldB" (string)
-    :param relevantObject: occupation for which a similar, already classified value is to be found (string)
-    :param keyRelevantDict: name of the column that contains the identifier (string); currently not used
-    :return: information on similarity analysis (dictionary)
-    """
-    # the best fitting value is to be found
-    # initial high values for a Levenshtein distance, which are undercut in any case
-    minimalDistAbs = 99999  # absolute
-    minimalDistRel = 99999  # relative
-    # binary variable, 0 if no hit was found, 1 if at least one hit was found
-    minOneFound = 0
-    # check against each existing entry
-    for counter, i in enumerate(existingVariantsKldB):
-        # Lower case for better comparability
-        relevantObjectLowerCase = relevantObject.lower()
-        existingVariantLowerCase = copy.copy(
-            i[key]).lower()  # copy important because it is changed afterwards
-        # compare only if first letters are the same (serves to improve runtime)
-        if existingVariantLowerCase[:1] == relevantObjectLowerCase[:1]:
-            # calculate Levenshtein distance
-            levDistAbs = Levenshtein.distance(existingVariantLowerCase, relevantObjectLowerCase)
-            # multiply levDistAbs by the number of blanks (+1) to penalize long multi-word phrases, e.g. "gewesener königlicher Richter" vs. "gewesener königlicher Koch"
-            levDistRel = levDistAbs * (relevantObject.count(" ") + 1) / len(relevantObject)
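-            # example: "Baur" vs. "Bauer" gives levDistAbs = 1 and levDistRel = 1 * 1 / 5 = 0.2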
-            # when the next one fits better
-            if levDistRel < minimalDistRel:
-                minimalDistAbs = levDistAbs
-                minimalDistRel = levDistRel
-                bestFitLemma = i[key]
-                cacheCounter = counter
-                # the hitlist is restarted until an equally good one comes along
-                hitlist = [[i[key], cacheCounter]]
-            # if the next one fits equally well
-            # ("elif" prevents the best hit from being appended twice)
-            elif levDistRel == minimalDistRel:
-                hitlist.append([i[key], counter])
-            # at least one hit
-            minOneFound = 1
-        # no similarity
-        else:
-            continue
-
-    # select one in case of multiple hits
-    # selection is made by the longest match from the front (matching leading characters)
-    try:
-        # if there were several hits of the same quality
-        # anything above 0.25 is assumed to be unrealistic here, serves to improve runtime
-        if len(hitlist) > 1 and minimalDistRel < 0.25:
-            longestMatch = -1
-            bestHitIndex = 0
-            # iterate all hits and count their matching leading characters
-            for hitIndex, hit in enumerate(hitlist):
-                numberMatchingChars = 0
-                for charPosition in range(min(len(hit[0]), len(relevantObject))):
-                    # stop counting at the first differing character
-                    if hit[0][charPosition] == relevantObject[charPosition]:
-                        numberMatchingChars = numberMatchingChars + 1
-                    else:
-                        break
-                # select the hit with the longest leading match
-                # several equally good hits can exist (e.g. duplicates in the list of
-                # already classified occupational data); the first one found is kept
-                if numberMatchingChars > longestMatch:
-                    longestMatch = numberMatchingChars
-                    bestHitIndex = hitIndex
-
-            # overwrite the relevant variables
-            bestFitLemma = hitlist[bestHitIndex][0]
-            cacheCounter = hitlist[bestHitIndex][1]
-    except UnboundLocalError:
-        # "hitlist" does not exist if no first letter ever matched
-        pass
-
-    # alternative, if the possibility above did not lead to success
-    # this may be due to the fact that abbreviations are included
-    if minimalDistRel >= 0.25:
-        # search for abbreviations marked with a dot
-        for counter, i in enumerate(existingVariantsKldB):
-            designationCopy = relevantObject.lower()
-            originalDesignation = copy.copy(i[key]).lower()  # copy important because it is changed afterwards
-            # only if first letters are equal (runtime improvement)
-            if originalDesignation[:1] == designationCopy[:1]:
-                # abbreviation handling
-                preDesignationCopy = designationCopy  # save previous value
-                designationCopy = abbreviationsCorrector(designationCopy, originalDesignation)
-                if designationCopy == preDesignationCopy:
-                    # the same again the other way around
-                    originalDesignation = abbreviationsCorrector(originalDesignation, designationCopy)
-                levDist = Levenshtein.distance(originalDesignation, designationCopy)
-                if levDist < minimalDistAbs:
-                    minimalDistAbs = levDist
-                    # if the new value is smaller, then overwrite relevant variables
-                    bestFitLemma = i[key]
-                    cacheCounter = counter
-                # at least one hit
-                minOneFound = 1
-
-    if minOneFound == 0:
-        bestFitLemma = "nothing"  # occurs, if e.g. the first letter is a colon; there is no variant to
-        cacheCounter = -1
-    # merge information
-    levenDict = {
-        "lemma row": cacheCounter,
-        "variant": relevantObject,
-        "best fit lemma": bestFitLemma,
-        "absolute distance": minimalDistAbs,
-        "relative distance": minimalDistRel
-    }
-    return (levenDict)
-
-
-def occuCleaner(occu, existingVariantsKldB):
-    """
-    This function cleans up individual occupation information.
-    It is also essential that various information is separated from the original job title.
-    This can concern several job titles, but also non-professional information.
-    :param occu: occupational title
-    :param existingVariantsKldB: already classified occupation KldB (list of dictionaries)
-    :return: information about the different occupational indications in the original indication (dictionary)
-    """
-
-    # storage of the original occupational title
-    originalOccu = occu
-
-    # initialization
-    # "occu1" does not need to be initialized because there is at least one occupation specification
-    occu2 = {}
-    occu3 = {}
-    occu4 = {}
-    occu5 = {}
-
-    # initialization
-    part1 = ""
-    part2 = ""
-    part3 = ""
-    part4 = ""
-    part5 = ""
-
-    # general preprocessing
-
-    # step 1: Remove spaces at the beginning and end
-    occu = occu.strip()
-
-    # step 2: Write out abbreviations
-    if "mstr." in occu:
-        occu = occu.replace("mstr.", "meister")
-    if "Ing." in occu:
-        occu = occu.replace("Ing.", "Ingenieur")
-
-    # step 3: Normalize separation operators
-    occu = separator(occu, " u.", " und")
-    occu = separator(occu, "+", " und ")  # there are also "und" (and) without spaces
-    occu = separator(occu, ", ", " und ")
-    occu = separator(occu, ",", " und ")
-    occu = separator(occu, "; ", " und ")
-    occu = separator(occu, " & ", " und ")
-    occu = separator(occu, " / ", " und ")
-    occu = separator(occu, "/", " und ")
-
-    # detail processing
-
-    # separate multiple occupations
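-    # example: "Bauer und Schmied" is split into part1 = "Bauer" and part2 = "Schmied"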
-    partList = [part1, part2, part3, part4, part5]  # parts are still all empty here
-    partCounter = 0
-    separation = " und "
-    partList[0] = occu  # is needed for initialization because the while loop accesses the next one
-    # partCounter < 4, so that no more than five parts are created
-    while separation in partList[partCounter] and partCounter < 4:
-        st = partList[partCounter]
-        # exception: do not separate when "-" precedes "und" (e.g. "Kauf- und Handelsmann") or in "k. und k."
-        if "- und " not in st and "k. und k." not in st:
-            partList[partCounter] = st[:st.find(" und ")]  # first part
-            partList[partCounter + 1] = st[(st.find(" und ") + len(" und ")):]  # second part
-        partCounter = partCounter + 1
-
-    # write back values from the partList
-    part1 = partList[0]
-    part2 = partList[1]
-    part3 = partList[2]
-    part4 = partList[3]
-    part5 = partList[4]
-
-    if partCounter == 0:  # if there is only one part
-        part1 = occu
-
-    # the content of the individual professional data is added to the dictionary afterwards
-    # only fill in if there is really content there
-    occu1 = partCorrector(part1, existingVariantsKldB)
-    if part2 != "":  # if there is no part2, then just keep going
-        occu2 = partCorrector(part2, existingVariantsKldB)
-        if part3 != "":  # can only be if there was a part2 beforehand
-            occu3 = partCorrector(part3, existingVariantsKldB)
-            if part4 != "":
-                occu4 = partCorrector(part4, existingVariantsKldB)
-                if part5 != "":
-                    occu5 = partCorrector(part5, existingVariantsKldB)
-
-    # information about the different occupational indications in the original indication
-    occuDictOfDicts = {
-        "variant": originalOccu,
-        "occupation 1": occu1,  # occu1 is a dictionary with occupation information
-        "occupation 2": occu2,
-        "occupation 3": occu3,
-        "occupation 4": occu4,
-        "occupation 5": occu5
-    }
-
-    return (occuDictOfDicts)
-
-
-def statistics(occuList, occuKeys):
-    """
-    This function counts the number of lemmatizations over the different process branches.
-    :param occuList: list of dictionaries with information on the analysed occupational designations
-    :param occuKeys: column headings for the analysis of separated occupations
-    """
-    # initialization of counters
-    counter = 0  # found directly in existing variants
-    counter0 = 0  # empty occupational designations (only came about as a result of cleanup, e.g. because only location information was given)
-    counter2 = 0  # found by Levenshtein distance
-    counter3 = 0  # could not be found
-    counter4 = 0  # found by Levenshtein distance NV
-    counter5 = 0  # found directly in existing variants NV
-
-    for i in occuList:
-        try:
-            # iterate the five possible keys ("occupation 1", ...)
-            for key in occuKeys:
-                # if the entry for the key does not contain any content, skip it
-                if i == [] or i[key] == {}:
-                    continue
-                elif i[key]["selection info"] == "found direct":
-                    counter = counter + i["number"]
-                elif i[key]["selection info"] == "found after levenshtein":
-                    counter2 = counter2 + i["number"]
-                elif i[key]["selection info"] == "not found":
-                    counter3 = counter3 + i["number"]
-                elif i[key]["selection info"] == "no occupational designation":
-                    counter0 = counter0 + i["number"]
-                elif i[key]["selection info"] == "found after levenshtein NV":
-                    counter4 = counter4 + i["number"]
-                elif i[key]["selection info"] == "found direct NV":
-                    counter5 = counter5 + i["number"]
-                else:
-                    print("Error: Selection information is missing.")
-        # some entries are still wrapped in a single-element list (results of pool.map)
-        except TypeError:
-            # iterate the five possible keys ("occupation 1", ...)
-            for key in occuKeys:
-                # if the entry for the key does not contain any content, skip it
-                if i == [] or i[0][key] == {}:
-                    continue
-                elif i[0][key]["selection info"] == "found direct":
-                    counter = counter + i[0]["number"]
-                elif i[0][key]["selection info"] == "found after levenshtein":
-                    counter2 = counter2 + i[0]["number"]
-                elif i[0][key]["selection info"] == "not found":
-                    counter3 = counter3 + i[0]["number"]
-                elif i[0][key]["selection info"] == "no occupational designation":
-                    counter0 = counter0 + i[0]["number"]
-                elif i[0][key]["selection info"] == "found after levenshtein NV":
-                    counter4 = counter4 + i[0]["number"]
-                elif i[0][key]["selection info"] == "found direct NV":
-                    counter5 = counter5 + i[0]["number"]
-                else:
-                    print("Error: Selection information is missing.")
-
-    # output of statistical information
-    # "+ 1" in the denominator avoids division by zero
-    counterSum = counter0 + counter + counter2 + counter3 + counter4 + counter5
-    print("Status: Proportion of adjusted occupations found directly in the variants:", counter / (counterSum + 1),
-          counter)
-    print("Status: Proportion of adjusted occupations found directly in the variants NV:",
-          counter5 / (counterSum + 1),
-          counter5)
-    print("Status: Proportion of adjusted occupations found with Levenshtein distance:", counter2 / (counterSum + 1),
-          counter2)
-    print("Status: Proportion of adjusted occupations found with Levenshtein distance NV:", counter4 / (counterSum + 1),
-          counter4)
-    print("Status: Proportion of adjusted occupations not found:", counter3 / (counterSum + 1), counter3)
-    print("Status: Proportion of empty occupational designations (through cleanup):", counter0 / (counterSum + 1), counter0)
-
-
-def preCreateOccuList(filename, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu,
-                      filenameVariants, filenameDistance, filenameOccu, occuKeys):
-    """
-    This function creates a list of occupational information available in a GEDCOM file.
-    :param filename: designation of the file (string)
-    :param existingVariantsKldB: data on the already classified occupation information
-    :param fieldnamesVariants: column headings of the newVariants.csv file
-    :param fieldnamesDistance: column headings of the levenshteinDistance.csv
-    :param fieldnamesOccu: column headings of the occuResult.csv file
-    :param filenameVariants: path and name of the newVariants.csv file
-    :param filenameDistance: path and name of the levenshteinDistance.csv
-    :param filenameOccu: path and name of the occuResult.csv file
-    :param occuKeys: keys for the separated professions
-    :return: list with occupational information
-    """
-    # a loop with one pass is necessary to be able to formulate a termination condition
-    for start in range(1):
-        # saving the name of the parallelization process
-        spawnPoolWorker = current_process().name
-
-        # loading data of a GEDCOM file
-        data = loadGedcomFile(filename)
-
-        # status information
-        print(spawnPoolWorker, "Status: The analysis of the occupational data for file", filename, "begins.")
-
-        # list of all occupations in one source
-        allOccupationsInSource = []
-
-        # iteration of each line in the GEDCOM file
-        for counter, i in enumerate(data):
-            # continue if OCCU tag is present
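-            # a GEDCOM occupation line looks like "1 OCCU Bauer": level digit, blank, tag, blank, value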
-            if i[2:6] == "OCCU":
-                occupation = i[7:]
-                # some files have the anomaly that the OCCU tag is empty, but the profession information is in the PLAC tag below it
-                # if this is the case, the information of the next line should be used
-                if occupation == "":
-                    occupation = data[counter + 1][7:]
-                allOccupationsInSource.append(occupation)
-
-    # "createOccuList" is called directly for each designation here, whereas elsewhere it is called via parallelization
-    occuList = []
-
-    # avoid duplicates
-    dubletCounterDict = {}
-    avoidDublettesList = []
-
-    for i in allOccupationsInSource:
-
-        # if the variant has already been edited, it should not be edited again
-        # however, a counter is then incremented, which documents the number of occurrences
-        if i in avoidDublettesList:  # comparison with already processed variants
-            # count up
-            dubletCounterDict.update({i: dubletCounterDict[i] + 1})
-
-            # update of the occuList
-            # searching for the right row
-            occuListPosition = next((item for item in occuList if item["variant"] == i), None)
-            occuListPosition["number"] = dubletCounterDict[i]
-        # if the occupation information has not yet been processed, then this should be done as follows
-        else:  # occupation statement for the first time in this source
-            dubletCounterDict.update({i: 1})
-            # extension of the list of processed designations
-            result = createOccuList(i, existingVariantsKldB, filename, dubletCounterDict)
-            occuList.append(result[0])  # "[0]" at the end is necessary because the function returns a list
-            avoidDublettesList.append(result[0]["variant"])
-
-    printOccuList(occuList, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu,
-                  filenameVariants, filenameDistance, filenameOccu, occuKeys)
-
-    return (occuList)
-
-
-def createOccuList(phrase, existingVariantsKldB, filename, dubletCounterDict):
-    """
-    This function creates a list of location information available in a source.
-    :param phrase: occupational designation (string)
-    :param filename: designation of the file (string)
-    :param existingVariantsKldB: data on the already classified occupation information (list)
-    :param filenameVariants: path and name of the newVariants.csv file (string)
-    :param filename: designation of the file (string)
-    :param dubletCounterDict: number of same occupational designations in a source (dictionary)
-    :return: list with occupational information
-    """
-
-    # a loop with one pass is necessary to be able to formulate a termination condition
-    for start in range(1):
-        # create a list with information about the new variants
-        occuList = []  # list of unadjusted variants in the source (list entries are dictionaries with a lot of information)
-        designationList = []  # list of adjusted variants in the source
-        # if the variant has already been edited, it should not be edited again
-        # however, a counter is then incremented, which documents the number of occurrences
-        if phrase in designationList:  # comparison with already processed variants
-            # search for the entry in the occuList that matches the variant
-            for j in occuList:
-                if j["variant"] == phrase:
-                    # count up number
-                    j["number"] = j["number"] + 1
-            # skip processing
-            continue
-        # if the occupation information has not yet been processed, then this should be done as follows
-        else:  # occupation statement for the first time in this source
-            # extension of the list of processed designations
-            designationList.append(phrase)
-            # variant cleanup
-            resultOccucleaner = occuCleaner(phrase, existingVariantsKldB)
-            # completing the file name and setting the number of occurrences
-            resultOccucleaner.update({"source": filename})
-            try:
-                resultOccucleaner.update({"number": dubletCounterDict[phrase]})
-            except KeyError:
-                resultOccucleaner.update({"number": 1})
-            # adding to the occuList in a dictionary
-            occuList.append(resultOccucleaner)
-
-    return (occuList)
-
-
-def printOccuList(occuList, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu,
-                  filenameVariants, filenameDistance, filenameOccu, occuKeys):
-    """
-    This function writes the results of the occupation analysis to the output files.
-    :param occuList: information to the occupational designations (list of dictionaries)
-    :param existingVariantsKldB: data on the already classified occupation information (list of dictionaries)
-    :param fieldnamesVariants: column headings of the newVariants.csv file (list of strings)
-    :param fieldnamesDistance: column headings of the levenshteinDistance.csv (list of strings)
-    :param fieldnamesOccu: column headings of the occuResult.csv file (list of strings)
-    :param filenameVariants: path and name of the newVariants.csv file (string)
-    :param filenameDistance: path and name of the levenshteinDistance.csv (string)
-    :param filenameOccu: path and name of the occuResult.csv file (string)
-    :param occuKeys: keys for the separated professions (list of strings)
-    :return: nothing (the results are only written to the output files)
-    """
-    # loading data of new variants
-    # this is necessary every time, because an identical job title can occur in one of the parallel processes
-    newVariants = loadData(filenameVariants, "\t", "latin1")
-
-    # if the selection was made on the basis of the Levenshtein distance, this information should be saved
-    # two lists are created for this purpose
-    levenList = []  # list is used to create the content for a new row in newVariants.csv
-    levenList2 = []  # list is used to create the content for a new line in "levenshteinDistance.csv"
-
-    # Iteration per occupation specification in the source
-    for i in occuList:
-        # check all five possible separated professions
-        for key in occuKeys:
-            # if entry for the key is not filled in, then skip it
-            if i[key] == {}:
-                continue
-            if i[key]["selection info"] == "found after levenshtein":
-                newDict = {
-                    "variant": i[key]["occupation"],
-                    "lemma": existingVariantsKldB[i[key]["row of best fit lemma"]]["variant"],
-                    "code": i[key]["KldB 2010"]
-                }
-                levenList.append(newDict)
-            elif i[key]["selection info"] == "found after levenshtein NV":
-                newDict = {
-                    "variant": i[key]["occupation"],
-                    "lemma": newVariants[i[key]["row of best fit lemma"]]["variant"],
-                    "code": i[key]["KldB 2010"]
-                }
-                levenList.append(newDict)
-            if i[key]["similarity analysis"] != "":  # for levenshteinDistance.csv
-                levenList2.append(i[key]["similarity analysis"])
-
-    # blocked printing of new lines in the files
-    # all files should be at the same level
-    # so if an error occurs with a variable, all files are not written to
-    # try:
-    # unpack dictionary information
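-    # example: {"occupation 1": {"role": "Frau"}} becomes {"occupation 1-role": "Frau"}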
-    unpackInfoList = []
-    for j in occuList:
-        # iterate the keys of j (occupation entries and metadata)
-        unpackInfoDict = {}
-        for i in j:
-            # contents of the dictionary are unpacked and written into individual fields
-            if type(j[i]) == dict:
-                for dictKey in j[i]:
-                    if dictKey == "absolute distance":
-                        unpackInfoDict.update(
-                            {str(i) + "-" + str(dictKey): j[i]["similarity analysis"]["absolute distance"]})
-                    if dictKey == "relative distance":
-                        unpackInfoDict.update(
-                            {str(i) + "-" + str(dictKey): j[i]["similarity analysis"]["relative distance"]})
-                    else:
-                        unpackInfoDict.update({str(i) + "-" + str(dictKey): j[i][dictKey]})
-                # delete "similarity analysis" if it is there
-                unpackInfoDict.pop(i + "-similarity analysis", None)
-            # if it is not a dictionary, then the content is taken over like this
-            else:
-                unpackInfoDict.update({i: j[i]})
-        unpackInfoList.append(unpackInfoDict)
-
-    appendFile(filenameOccu, unpackInfoList, fieldnamesOccu)
-    appendFile(filenameVariants, levenList, fieldnamesVariants)
-    # appendFile(filenameDistance, levenList2, fieldnamesDistance)
-    # except:
-    #    print(
-    #        "Error: Blocked printing of the lines failed. Manual deletion of the entries of the last file appropriate.")
-
-
-if __name__ == '__main__':
-    # part up to 'parallelization' is executed once at the beginning
-
-    inputDataType = "ged"  # data type in which the input data is available, "ged" and "csv" are possible
-
-    # storage of the time at the beginning of the program run
-    starttime = time.perf_counter()
-
-    if inputDataType == "csv":
-        pass
-    elif inputDataType == "ged":
-        # loading the sources (exemplary here: GEDCOM files from GEDBAS)
-        # definition of the range in which the file names are located (e.g. 1.ged to 60000.ged)
-        begin = 0
-        end = 60000
-        # creation of a list with the possible file names
-        gedcomNamesList = []
-        for number in range(begin, end):
-            gedcomNamesList.append(str(number) + ".ged")
-        # check if the files exist
-        # exclude non-existent files
-        gedcomNamesListClear = []
-        for i in gedcomNamesList:
-            # files are located in the 'data' subfolder
-            filepath = os.path.join("data", i)
-            try:
-                # if opening works, the file exists and is added to a new list
-                gedcom = open(filepath, "r", encoding="utf-8")
-                gedcom.close()
-                gedcomNamesListClear.append(i)
-            except FileNotFoundError:
-                pass
-
-    # open more context data
-    # data from the Historical Data Center of Saxony-Anhalt
-    # classification based on the Klassifikation der Berufe (KldB, Classification of Professions)
-    # data from another classification system can also be used here
-    # file contains already classified occupational variants
-    filename = os.path.join("data", "variants.csv")
-    fieldnames = ["idVariant",  # unique ID of the occupational variant
-                  "variant",  # textual representation of the variant
-                  "code"  # code of the OhdAB
-                  ]
-    # loading data from existing file
-    # if no file exists, a new one is created
-    existingVariantsKldB = createFile(filename, fieldnames, ";", "latin1")
-
-    # status message on the number of existing variants
-    print("Status:", len(existingVariantsKldB), "classified variants already exist.")
-
-    # if halving of variants is to be done for testing purposes, set halving to "yes"
-    halving = "yes"
-    # deletion of every second already classified occupation information
-    if halving == "yes":
-        remainingVariantsKldB = []
-        for number, i in enumerate(existingVariantsKldB):
-            if number % 2 == 0:
-                remainingVariantsKldB.append(i)
-        print("Status: There has been a halving of the variants for testing purposes.", len(remainingVariantsKldB),
-              "variants remain.")
-        # overwrite the variable of all variants
-        existingVariantsKldB = remainingVariantsKldB
-
-    # create file for saving the newly classified variants
-    filenameVariants = os.path.join("data", "newVariants.csv")
-    fieldnamesVariants = ["variant",  # designation of the new variant of an occupation
-                          "lemma",  # existing designation of an occupation to which the new variant is assigned
-                          "code"  # code according to KldB
-                          ]
-    createFile(filenameVariants, fieldnamesVariants, "\t", "latin1")
-
-    # list about the best hits for each checked job title
-    filenameDistance = "levenshteinDistance.csv"
-    fieldnamesDistance = ["relative distance",  # absolute Levenshtein distance divided by the length of the variant
-                          "absolute distance",  # absolute Levenshtein distance
-                          "variant",  # designation of the new variant of an occupation
-                          "best fit lemma",  # designation of the best fitting existing variant
-                          "selection",  # binary information whether the lemma was selected (1 means yes, 0 means no)
-                          "lemma row"  # number of the line in the existing variants
-                          ]
-    # createFile(filenameDistance, fieldnamesDistance, "\t", "latin1")
-
-    # list for dividing the different components of a job specification
-    filenameOccu = "occuResult.csv"
-    fieldnamesOccu = ["variant",  # designation of the new variant of an occupation
-                      "source",  # name of the file in which the variant occurs (source)
-                      "number",  # Number of occurrences of the variant in the source
-                      "occupation 1-occupation",  # information about the first occupation found
-                      "occupation 1-KldB 2010",
-                      "occupation 1-best fit lemma",
-                      "occupation 1-row of best fit lemma",
-                      "occupation 1-titel",
-                      "occupation 1-role",
-                      "occupation 1-year",
-                      "occupation 1-url",
-                      "occupation 1-location",
-                      "occupation 1-further info",
-                      "occupation 1-selection info",
-                      "occupation 1-lemma row",
-                      "occupation 1-absolute distance",
-                      "occupation 1-relative distance",
-                      "occupation 2-occupation",  # information about the second occupation found0
-                      "occupation 2-KldB 2010",
-                      "occupation 2-best fit lemma",
-                      "occupation 2-row of best fit lemma",
-                      "occupation 2-titel",
-                      "occupation 2-role",
-                      "occupation 2-year",
-                      "occupation 2-url",
-                      "occupation 2-location",
-                      "occupation 2-further info",
-                      "occupation 2-selection info",
-                      "occupation 2-similarity analysis",
-                      "occupation 2-lemma row",
-                      "occupation 2-absolute distance",
-                      "occupation 2-relative distance",
-                      "occupation 3-occupation",  # information about the third occupation found
-                      "occupation 3-KldB 2010",
-                      "occupation 3-best fit lemma",
-                      "occupation 3-row of best fit lemma",
-                      "occupation 3-titel",
-                      "occupation 3-role",
-                      "occupation 3-year",
-                      "occupation 3-url",
-                      "occupation 3-location",
-                      "occupation 3-further info",
-                      "occupation 3-selection info",
-                      "occupation 3-lemma row",
-                      "occupation 3-absolute distance",
-                      "occupation 3-relative distance",
-                      "occupation 4-occupation",  # information about the fourth occupation found
-                      "occupation 4-KldB 2010",
-                      "occupation 4-best fit lemma",
-                      "occupation 4-row of best fit lemma",
-                      "occupation 4-titel",
-                      "occupation 4-role",
-                      "occupation 4-year",
-                      "occupation 4-url",
-                      "occupation 4-location",
-                      "occupation 4-further info",
-                      "occupation 4-selection info",
-                      "occupation 4-lemma row",
-                      "occupation 4-absolute distance",
-                      "occupation 4-relative distance",
-                      "occupation 5-occupation",  # information about the fifth occupation found
-                      "occupation 5-KldB 2010",
-                      "occupation 5-best fit lemma",
-                      "occupation 5-row of best fit lemma",
-                      "occupation 5-titel",
-                      "occupation 5-role",
-                      "occupation 5-year",
-                      "occupation 5-url",
-                      "occupation 5-location",
-                      "occupation 5-further info",
-                      "occupation 5-selection info",
-                      "occupation 5-lemma row",
-                      "occupation 5-absolute distance",
-                      "occupation 5-relative distance",
-                      ]
-    createFile(filenameOccu, fieldnamesOccu, "\t", "latin1")
-
-    # definition of the keys for the separated professions
-    occuKeys = ["occupation 1", "occupation 2", "occupation 3", "occupation 4", "occupation 5"]
-
-    # initialization of a list in which the results of the upcoming parallelized process are stored
-    # this will process a list of occupation details in parallel
-    # the result is a list of dictionaries containing different information about the analysis (occuList)
-    occuList = []
-
-    # parallelization
-    if inputDataType == "csv":
-        occupationsList = loadData("occupations.csv", ";", "utf-8-sig")
-        listOfOccupations = []
-        for i in occupationsList:
-            listOfOccupations.append(i["occupation"])
-    if inputDataType == "ged":
-        pass
-
-    pool = Pool(1)  # number of cores used is variable
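-    # note: Pool(1) runs the work sequentially; a higher value, e.g. Pool(4), uses more cores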
-
-    dubletCounterDict = {}
-
-    if inputDataType == "csv":
-        # count duplicate occupation entries
-        designationList = []  # list of adjusted variants in the source
-
-        for occupation in listOfOccupations:
-            # if the variant has already been edited, it should not be edited again
-            # however, a counter is then incremented, which documents the number of occurrences
-            if occupation in designationList:  # comparison with already processed variants
-                # count up
-                dubletCounterDict.update({occupation: dubletCounterDict[occupation] + 1})
-
-            # if the occupation information has not yet been processed, then this should be done as follows
-            else:  # occupation statement for the first time in this source
-                dubletCounterDict.update({occupation: 1})
-                # extension of the list of processed designations
-                designationList.append(occupation)
-
-        # remove duplicate occupation entries
-        print("Status: Using a csv file with", len(listOfOccupations), "occupations")
-        listOfOccupations = set(listOfOccupations)
-        print("Status: File contains", len(listOfOccupations), "different occupational titles")
-
-        for row in listOfOccupations:
-            occuList.append(createOccuList(row, existingVariantsKldB, "occupations.csv", dubletCounterDict)[0])
-
-        printOccuList(occuList, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu,
-                      filenameVariants, filenameDistance, filenameOccu, occuKeys)
-
-    elif inputDataType == "ged":
-
-        occuList = pool.map(partial(preCreateOccuList,
-                                    existingVariantsKldB=existingVariantsKldB,
-                                    fieldnamesVariants=fieldnamesVariants,
-                                    fieldnamesDistance=fieldnamesDistance,
-                                    fieldnamesOccu=fieldnamesOccu,
-                                    filenameVariants=filenameVariants,
-                                    filenameDistance=filenameDistance,
-                                    filenameOccu=filenameOccu,
-                                    occuKeys=occuKeys), gedcomNamesListClear)
-
-    else:
-        print("Error: No valide inputDataType")
-
-    pool.close()
-    pool.join()
-
-    # second processing loop for the designations that are not found but have components
-    # Example: "farmer and craftsman" is not found, but "farmer" and "craftsman" are found individually
-    print("Status: Second processing started")
-
-    # second processing
-    gedcomNamesListClear2 = []
-    # iterate all original occupation information
-    for i in occuList:
-        # iterate the five possible keys ("occupation 1", ...)
-        for key in occuKeys:
-            # if the entry for the key does not contain any content, skip it
-            try:
-                if i[key] == {}:
-                    continue
-                # only professions that are "not found"
-                if i[key]["selection info"] == "not found":
-                    gedcomNamesListClear2.append(i[key]["occupation"])
-            except (TypeError, KeyError):  # if it is still in a list with only one value
-                if i == [] or i[0][key] == {}:
-                    continue
-                # only professions that are "not found"
-                if i[0][key]["selection info"] == "not found":
-                    gedcomNamesListClear2.append(i[0][key]["occupation"])
-
-    # parallelization
-    pool = Pool(1)  # number of cores used is variable
-    occuList2 = pool.map(partial(createOccuList,
-                                 existingVariantsKldB=existingVariantsKldB,
-                                 dubletCounterDict=dubletCounterDict,
-                                 filename="second try"), gedcomNamesListClear2)
-
-    pool.close()
-    pool.join()
-
-    # unpack list
-    occuList2new = []
-    for oneOccu in occuList2:
-        occuList2new.append(oneOccu[0])
-    occuList2 = occuList2new
-
-    printOccuList(occuList2, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu,
-                  filenameVariants, filenameDistance, filenameOccu, occuKeys)
-
-    # the same again for a third iteration
-
-    # third processing
-    print("Status: Third processing started")
-    gedcomNamesListClear3 = []
-    # iterate all original occupation information
-    for i in occuList2:
-        # iterate the five possible keys ("occupation 1", ...)
-        for key in occuKeys:
-            # if the entry for the key does not contain any content, skip it
-            try:
-                if i[key] == {}:
-                    continue
-                # only professions that are "not found"
-                if i[key]["selection info"] == "not found":
-                    gedcomNamesListClear3.append(i[key]["occupation"])
-            except (TypeError, KeyError):  # if it is still in a list with only one value
-                if i[0][key] == {}:
-                    continue
-                # only professions that are "not found"
-                if i[0][key]["selection info"] == "not found":
-                    gedcomNamesListClear3.append(i[0][key]["occupation"])
-
-    # parallelization
-    pool = Pool(1)  # number of cores used is variable
-    occuList3 = pool.map(partial(createOccuList,
-                                 existingVariantsKldB=existingVariantsKldB,
-                                 dubletCounterDict=dubletCounterDict,
-                                 filename="third try"), gedcomNamesListClear3)
-
-    pool.close()
-    pool.join()
-
-    # unpack list
-    occuList3new = []
-    for oneOccu in occuList3:
-        occuList3new.append(oneOccu[0])
-    occuList3 = occuList3new
-
-    printOccuList(occuList3, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu,
-                  filenameVariants, filenameDistance, filenameOccu, occuKeys)
-
-    # creation of statistics for the three iterations
-    statistics(occuList, occuKeys)
-    statistics(occuList2, occuKeys)
-    statistics(occuList3, occuKeys)
-
-    # storage of the time at the end of the program run
-    finishtime = time.perf_counter()
-
-    # status info
-    print("Status: Program finished in", round(finishtime - starttime, 2), "seconds(s)")
diff --git a/2022_005_goldberg/Skripte/Scraper/main.py b/2022_005_goldberg/Skripte/Scraper/main.py
deleted file mode 100644
index 7334d6e3c5339ce47a7f91be620e7c035c883d5f..0000000000000000000000000000000000000000
--- a/2022_005_goldberg/Skripte/Scraper/main.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Ignore the following warning: InsecureRequestWarning: Unverified HTTPS request is being made to host 'gedbas.genealogy.net'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings
-import urllib3
-
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
-# Scraper based on introduction on https://towardsdatascience.com/how-to-web-scrape-with-python-in-4-minutes-bc49186a8460
-# published by Julia Kho on 27 September 2019, last accessed on 2 July 2021
-
-# import of libraries
-import requests
-from bs4 import BeautifulSoup
-import io
-
-# initialisation of counters
-number = 0
-found = 0
-notAllowed = 0
-notFound = 0
-notFoundSeries = 0
-end = 10000
-empty = 0
-
-# possible GEDCOM files were determined by varying the URL
-# the end of the URL is changed with a number that is incremented
-# execution of the iteration until no result is found 10000 times (value of variable "end")
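-# illustrative: number = 17 yields the URL https://gedbas.genealogy.net/gedcom/export/17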
-while notFoundSeries != end:
-    # definition of the URL
-    url = "https://gedbas.genealogy.net/gedcom/export/" + str(number)
-    try:
-        # scraping the information
-        response = requests.get(url, verify=False)
-        gedcom = str(BeautifulSoup(response.text, "html.parser"))
-        # analysis of the information
-        # with "not allowed" no public access to the file is possible
-        # with "not found", the file is not (no longer) available
-        # in all other cases the information is written to a GEDCOM file
-        if gedcom != "not found" and gedcom != "not allowed" and gedcom != "":
-            filename = str(number) + ".ged"
-            file = io.open(filename, "w", encoding="utf-8")
-            file.write(gedcom)
-            file.close()
-            # count number of detected files
-            found = found + 1
-            # resetting the counter that counts the number of unsuccessful calls in series
-            notFoundSeries = 0
-        # count number of not allowed files
-        if gedcom == "not allowed":
-            notAllowed = notAllowed + 1
-            notFoundSeries = 0
-        # count number of not more existing files
-        if gedcom == "not found":
-            notFound = notFound + 1
-            notFoundSeries = notFoundSeries + 1
-        # count number of empty gedcom files
-        if gedcom == "":
-            empty = empty + 1
-            notFoundSeries = 0
-    except Exception:
-        print("Status: There is an error in file " + str(number) + ".")
-    # gives info every 1000 urls
-    if number % 1000 == 0:
-        print("Status:", str(number) + " urls were analyzed")
-    # count up per analysed URL
-    number = number + 1
-
-# printing status information
-print("Status: Scraping finished")
-print("Status: " + str(found) + " files could be found")
-print("Status: Access was denied for " + str(notAllowed) + " files")
-print("Status: " + str(notFound - end) + " files were deleted")
-print("Status: " + str(empty) + " files were blank")
diff --git a/2022_005_goldberg/Skripte/main.py b/2022_005_goldberg/Skripte/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f26d448a148d6bca6cde05ad7cc58b123f49a52
--- /dev/null
+++ b/2022_005_goldberg/Skripte/main.py
@@ -0,0 +1,291 @@
+import qualitychecker
+import placefinder
+import provincefinder
+import csv
+import os.path
+from multiprocessing import Pool, current_process
+from functools import partial
+import time
+from zeep import Client
+import json
+
+
+def importMiniGOV():
+    """
+    This function loads the Mini-GOV that is located in the data folder.
+    This is used to assign a location to a standard notation.
+    :return: list of Mini-GOV entries
+    """
+    # Information from http://wiki-de.genealogy.net/GOV/Mini-GOV, 23.03.2020
+    # german language designation, because those in the Mini-GOV are also in German language
+    miniGOVHeaders = ["GOV-Kennung", "Objekttyp als Text", "Objekttyp als Zahl", "aktueller Name",
+                      "letzter deutscher Name", "Staat", "adm. Zuordnung 1", "adm. Zuordnung 2", "adm. Zuordnung 3",
+                      "adm. Zuordnung 4", "Postleitzahl", "geographische Breite", "geographische Länge"]
+
+    # embedding the Mini-GOVs of different countries (Germany, Poland, Austria, Switzerland, Czech Republic, Denmark, France, Netherlands)
+    miniGOVFiles = ["gov-data_D_20190325_201241.txt", "gov-data_PL_20190325_201241.txt",
+                    "gov-data_A_20190325_201241.txt", "gov-data_CH.txt", "gov-data_CZ_20190325_201241.txt",
+                    "gov-data_DK.txt", "gov-data_F_20190325_201241.txt", "gov-data_NL.txt"]
+
+    miniGOV = []  # initialize list with entries of Mini-GOV
+    # for each named Mini-GOV file the data is loaded and merged into miniGOV
+    for i in miniGOVFiles:
+        filepath = os.path.join("data", i)
+        with open(filepath, encoding="utf-8") as csvfile:
+            reader = csv.DictReader(csvfile, delimiter="\t", fieldnames=miniGOVHeaders)
+            miniGOV = miniGOV + list(reader)
+
+    # expand miniGOV, because some Mini-GOV files contain previous German names
+    # for each former German name an additional entry is created so that the place can also be found under that name
+    expandMiniGOV = []  # initialize
+    for i in miniGOV:
+        if i["letzter deutscher Name"] != "":
+            # copy the entry, so that the original entry keeps its current name
+            duplicate = dict(i)
+            duplicate["aktueller Name"] = duplicate["letzter deutscher Name"]
+            expandMiniGOV.append(duplicate)
+
+    # merge miniGOV and expandMiniGOV
+    miniGOV = miniGOV + expandMiniGOV
+
+    # alphabetical sorting of miniGOV in relation to the column with the name of the place
+    # all designations are written in lower case
+    # .lower() is extremely important here, because otherwise capital letters are preferred over small ones and such cases occur in the GOV (e.g. some places starting with IJ, IJselstein)
+    miniGOV = (sorted(miniGOV, key=lambda x: (x["aktueller Name"].lower())))
+    return (miniGOV)
+
+
+def loadData(filename, delimiter, encoding):
+    """
+    This function opens files in which data is temporarily stored and was created by the program in a previous run.
+    :param filename: name of the file
+    :param delimiter: string of delimiter
+    :return: content of the file as a list of dictionaries; if no file exists, an empty list
+    """
+    emptyList = []  # list of dicts
+    try:
+        with open(filename, "r", encoding=encoding) as data:
+            for i in csv.DictReader(data, delimiter=delimiter):
+                i = json.loads(json.dumps(i))
+                emptyList.append(i)
+    except FileNotFoundError:
+        print("Status: Initial run, do not create a list yet:", filename)
+    return (emptyList)
+
+
+def appendFile(filename, data, fieldnames, moreThanOneRow):
+    """
+    This function appends one or more lines to an existing file.
+    :param filename: name of the file
+    :param data: content of the line(s) to be added
+    :param fieldnames: column headers of the file
+    :param moreThanOneRow: 0 if a single row is to be written, 1 if several rows are to be written (integer)
+    """
+    openQualityChecker = open(filename, "a", newline="", encoding="utf-8")
+    writerQualityChecker = csv.DictWriter(openQualityChecker, fieldnames=fieldnames, delimiter="\t")
+    # distinction between adding data to "quality.csv" file and other files
+    # with "quality.csv" only one row is added, with all others several rows
+    if moreThanOneRow == 0:
+        writerQualityChecker.writerow(data)
+    else:
+        writerQualityChecker.writerows(data)
+    openQualityChecker.close()
+
+
+def createFile(filename, fieldnames, delimiter, encoding):
+    """
+    This function is used to create files if they do not yet exist.
+    But if they already exist, the existing content will be loaded.
+    :param filename: name of the file
+    :param fieldnames: column headers of the file
+    :return: loaded data; if there is no data, an empty list is returned
+    """
+    # load existing content
+    loadedData = loadData(filename, delimiter, encoding)
+    # create a new file if it is not there
+    if len(loadedData) == 0:  # only if loadedData is an empty list a new file is created
+        openQualityChecker = open(filename, "w", newline="", encoding="utf-8")
+        writerQualityChecker = csv.writer(openQualityChecker, delimiter=delimiter)
+        writerQualityChecker.writerow(fieldnames)
+        openQualityChecker.close()
+    return (loadedData)
+
+
+def loadGedcomFile(datename):
+    """
+    This function loads the data from a single GEDCOM file.
+    If the sources are not in GEDCOM format, this area must be adjusted.
+    :param datename: name of source (here GEDCOM file)
+    :return: list containing one entry per line of a GEDCOM file; if the file cannot be found "NONE" is returned
+    """
+    filepath = os.path.join("data", datename)
+    line = []  # initialize empty list
+    try:
+        gedcom = open(filepath, "r", encoding="utf-8")
+        data = gedcom.readline()
+        # delete the last character of each line, which is a line break
+        data = data[:-1]
+        # the last line is empty, so the lines are processed until this empty line appears
+        while data != "":
+            data = str(gedcom.readline())
+            data = data[:-1]
+            line.append(data)
+        gedcom.close()
+        return (line)
+    except FileNotFoundError:
+        print("Error: Problem with access to file", datename, ".")
+        return ("NONE")
+
+
+def parallel(filename, miniGovList, qualityDict, fieldnamesStep1, fieldnamesStep2, fieldnamesStep3, filenameStep1,
+             filenameStep2, filenameStep3):
+    """
+    This function is called once per source (here GEDCOM file).
+    The process consists of three steps.
+    First, a metadata analysis is performed, the result of which can be found in the file "quality.csv".
+    Then the urbanonyms are subjected to identification.
+    In the third step, regional clustering is performed for a defined reference year.
+    The goal is to extend the files "quality.csv", "placefinder.csv" and "provincesdict.csv".
+    :param filename: name of the file/source
+    :param miniGovList: list of merged entries of the Mini-GOV
+    :param qualityDict: Metadata about the data from previous program runs
+    :param fieldnamesStep1: name of the columns of the file "quality.csv"
+    :param fieldnamesStep2: name of the columns of the file "placefinder.csv"
+    :param fieldnamesStep3: name of the columns of the file "provincesdict.csv"
+    :param filenameStep1: string of the file name "quality.csv"
+    :param filenameStep2: string of the file name "placefinder.csv"
+    :param filenameStep3: string of the file name "provincesdict.csv"
+    """
+    # a loop with one iteration is used here to formulate a termination condition
+    for i in range(1):
+        # note the number of the parallelization process
+        spawnPoolWorker = current_process().name
+
+        # load data of a GEDCOM file
+        # must be changed if source is not a GEDCOM file
+        data = loadGedcomFile(filename)
+
+        # Step 1: Metadata/Quality analysis
+        print(spawnPoolWorker, "Status: Metadata analysis of", filename, "begins.")
+        resultQualityChecker = qualitychecker.mainMetadataInspector(data, filename, miniGovList, qualityDict)
+        if resultQualityChecker == "StartingExitStrategy":
+            print(spawnPoolWorker, "Status: The data to file", filename, "is complete.")
+            continue  # check next file
+
+        # Step 2: Identification
+        print(spawnPoolWorker, "Status: Identifying the places of", filename, "begins.")
+        resultPlaceFinder = placefinder.mainPlaceFinder(data, resultQualityChecker, filename, miniGovList)
+
+        # Step 3: Clustering
+        print(spawnPoolWorker, "Status: Clustering of the places of", filename, "begins.")
+        # definition of a year at which the administrative clustering should take place
+        referencetime = 1800
+        # a working internet connection is necessary
+        client = Client("https://gov.genealogy.net/services/ComplexService?wsdl")
+        resultProvinceFinder = provincefinder.mainProvinceFinder(resultPlaceFinder, filename, client, referencetime)
+
+        # block-wise extension of the three files
+        # done as one block so that all files stay at the same level if output is written
+        try:
+            appendFile(filenameStep1, resultQualityChecker, fieldnamesStep1, 0)  # only one row
+            appendFile(filenameStep2, resultPlaceFinder, fieldnamesStep2, 1)
+            appendFile(filenameStep3, resultProvinceFinder, fieldnamesStep3, 1)
+        except Exception:
+            print("Error: Block-wise writing of lines failed. The last entries appended to the files must be deleted manually.")
+
+
+if __name__ == "__main__":
+    """
+    This construction exists to prepare the parallelization.
+    The section up to the comment "start of parallelization" is executed only once. 
+    It is used to load the location data from the source (here GEDCOM files) and create CSV files initially.
+    """
+    # memorizing the start time
+    starttime = time.perf_counter()
+
+    # define range of GEDCOM data
+    # assume that the GEDCOM files are in 12345.ged format
+    begin = 0  # starts at 0.ged
+    end = 60000  # ends at 60000
+    gedcomNamesList = []  # creation of a list with possible GEDCOM file names
+    while begin != end:
+        datename = str(begin) + ".ged"  # name of GEDCOM file
+        gedcomNamesList.append(datename)
+        begin = begin + 1
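+    # illustrative: with begin = 0 and end = 60000 this yields "0.ged", "1.ged", ..., "59999.ged"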
+
+    # possibility that not all files of the gedcomNamesList exist
+    # do not let the non-existent files into the multiprocessing
+    # check the existence of the files
+    gedcomNamesListClear = []  # version of gedcomNamesList, which contains only existing files
+    for i in gedcomNamesList:
+        filepath = os.path.join("data", i)  # GEDCOM files are located in the subfolder "data"
+        try:
+            gedcom = open(filepath, "r", encoding="utf-8")
+            gedcom.close()
+            gedcomNamesListClear.append(i)
+        except FileNotFoundError:
+            pass
+
+    # Loading data from the Mini-GOV
+    miniGovList = importMiniGOV()
+
+    # initialization of CSV files, which are needed in the further course
+    filenameStep1 = "quality.csv"
+    fieldnamesStep1 = ["filename",  # name of GEDCOM file
+                       "number of places",
+                       "number of noHit",
+                       "number of moreThanOneHit",
+                       "number of definitely coordinates",
+                       "longitude mean of of definitely coordinates",
+                       "latitude mean of of definitely coordinates",
+                       "number of existing clusters",
+                       "number of relevant clusters",
+                       "cluster midpoints"  # list of lists of geographical centers of individual clusters
+                       ]
+    # load already existing data into a variable
+    qualityDict = createFile(filenameStep1, fieldnamesStep1, "\t", "utf-8")
+
+    # list of all urbanonyms per source
+    filenameStep2 = "placefinder.csv"
+    fieldnamesStep2 = ["id",  # GOV-ID of a place
+                       "latitude",  # latitude of the place
+                       "longitude",  # longitude of the place
+                       "selection information",  # description of the identification of this urbanonym
+                       "adjusted name",  # adjusted spelling of the urbanonym in the source
+                       "original name",  # original spelling of the urbanonym in the source
+                       "filename"  # name of the file where the urbanonym is found
+                       ]
+    createFile(filenameStep2, fieldnamesStep2, "\t", "utf-8")
+
+    # list of urbanonyms already assigned to a province, per file, to avoid double searches
+    filenameStep3 = "provincesdict.csv"
+    fieldnamesStep3 = ["original name",  # original spelling of the urbanonym in the source
+                       "filename",  # name of the file where the urbanonym is found
+                       "id",  # GOV-ID of a place
+                       "province"  # name of assigned administrative unit
+                       ]
+    createFile(filenameStep3, fieldnamesStep3, "\t", "utf-8")
+
+    # start of parallelization
+    # executes the function "parallel" per entry in the list gedcomNamesListClear (per urbanonym)
+    # parallelization is realized to shorten the processing time
+    pool = Pool()
+    pool.map(partial(parallel,
+                     miniGovList=miniGovList,
+                     qualityDict=qualityDict,
+                     fieldnamesStep1=fieldnamesStep1,
+                     fieldnamesStep2=fieldnamesStep2,
+                     fieldnamesStep3=fieldnamesStep3,
+                     filenameStep1=filenameStep1,
+                     filenameStep2=filenameStep2,
+                     filenameStep3=filenameStep3),
+             gedcomNamesListClear)
+    pool.close()
+    pool.join()
+
+    # memorizing the time of finishing
+    finishtime = time.perf_counter()
+
+    # print the duration of the program run
+    print("Finished in", round(finishtime - starttime, 2), "seconds(s)")
diff --git a/2022_005_goldberg/Skripte/occupationMain.py b/2022_005_goldberg/Skripte/occupationMain.py
deleted file mode 100644
index d90b58d4fee674e6d8ba70259e218b985281754d..0000000000000000000000000000000000000000
--- a/2022_005_goldberg/Skripte/occupationMain.py
+++ /dev/null
@@ -1,1047 +0,0 @@
-import csv
-import os.path
-from multiprocessing import Pool, current_process
-from functools import partial
-import time
-import json
-import Levenshtein
-import copy
-
-
-def loadData(filename, delimiter, encoding):
-    """
-    This function is used to open files in which data is temporarily stored and was created by the program in a previous run.
-    :param filename: designation of the file
-    :param delimiter: type of delimiter as string
-    :return: list of dictionaries with information of the file to be loaded
-    """
-    content = []  # list oft dicts
-    try:
-        with open(filename, "r", encoding=encoding) as data:  # , errors='ignore'
-            for i in csv.DictReader(data, delimiter=delimiter):
-                i = json.loads(json.dumps(i))
-                content.append(i)
-    except FileNotFoundError:
-        print("Status: Inital pass for file", filename, "(no list created yet).")
-    return (content)
-
-
-def appendFile(filename, data, fieldnames):
-    """
-    This function writes data to CSV files.
-    :param filename: designation of the file (string)
-    :param data: content of the line(s) to be written (list of dicts or dict)
-    :param fieldnames: column names of the file to be written (list of strings)
-    """
-    opener = open(filename, "a", newline='', encoding="utf-8")
-    writer = csv.DictWriter(opener, fieldnames=fieldnames, delimiter="\t")
-    # differentiation of cases where one or more lines are to be added
-    # for the file "qualityofgedcom.csv" only one line should be written at a time
-    # for all other files several lines should be written
-    if filename == "qualityofgedcom.csv":
-        writer.writerow(data)
-    else:
-        writer.writerows(data)
-    opener.close()
-
-
-def createFile(filename, fieldnames, delimiter, encoding):
-    """
-    This function creates a new file if no file already exists under this name.
-    The function is also used to load data when it is clear that the file already exists.
-    :param filename: designation of the file (string)
-    :param fieldnames: column names of the file to be written (list of strings)
-    :param delimiter: type of delimiter (string)
-    :return: list of dictionaries with information of the file to be loaded
-    """
-    content = loadData(filename, delimiter, encoding)
-    # create a new file if it is not there
-    if len(content) == 0:  # check if the variable does not contain any data
-        opener = open(filename, "w", newline='', encoding="utf-8")
-        writer = csv.writer(opener, delimiter=delimiter)
-        writer.writerow(fieldnames)
-        opener.close()
-    return (content)
-
-
-def loadGedcomFile(filename):
-    """
-    This function loads the data of a GEDCOM file and writes them line by line into a list.
-    :param filename: name of the file
-    :return: in case of error "NONE", otherwise a list with the information of the GEDCOM file
-    """
-    # define file path
-    filepath = os.path.join("data", filename)
-    preparedData = []
-    try:
-        gedcom = open(filepath, "r", encoding="utf-8")
-        data = gedcom.readline()
-        # initial transfer of the headline
-        data = data[:-1]  # delete the unimportant last character of each line
-        while data != "":  # last line is empty
-            data = str(gedcom.readline())
-            data = data[:-1]  # delete the unimportant last character of each line
-            preparedData.append(data)
-        gedcom.close()
-        return (preparedData)
-    except FileNotFoundError:
-        print("Error: There is a problem with access to the file", filename, ".")
-        return ("NONE")
-
-
-def separator(occu, replaced, replacer):
-    """
-    This function is used to replace separation operators.
-    :param occu: string that is processed
-    :param replaced: substring to be replaced
-    :param replacer: replacement substring
-    :return: new string with changed content
-    """
-    if replaced in occu:
-        occu = occu.replace(replaced, replacer)
-    return (occu)
-
-
-def endOfString(phrase, signalWord):
-    """
-    This function is used to detect the position of an element of a string.
-    The respective end position of a part is determined, if it exists.
-    Everything before this position is removed.
-    :param phrase: string to be searched (string)
-    :param signalWord: indicates a following place name (string)
-    :return: text after the end position of the signal word in the phrase
-    """
-    # if phrase contains the signal word, then find end position of the signal word and remove everything behind
-    if signalWord in phrase:
-        endOfString = phrase[(phrase.find(signalWord) + len(signalWord)):]
-        return (endOfString)
-    return ("")
-
-
-def replaceLoc(signalWord, phrase, loc):
-    """
-    This function is used to remove location names from an occupation phrase.
-    :param signalWord: indicates a following place name (string)
-    :param phrase: string to be searched (string)
-    :param loc: designation of a place (string)
-    :return: adjusted occupation phrase
-    """
-    if signalWord in phrase:
-        phrase = phrase.replace(signalWord, "")  # remove "signalWord"
-        phrase = phrase.replace(loc, "")  # remote location
-    return (phrase)
-
-
-def dictSearch(relevantDict, key, relevantObject):
-    """
-    This function searches a given list of dictionaries for a searched value and specifies the key.
-    :param relevantDict: list of dictionaries that will be searched
-    :param key: key of the dictionary to be studied
-    :param relevantObject: name of the value to be searched for under the key in the Dictionary
-    :return: index of the matching dictionary in the list (-1 if none is found)
-    """
-    # search per list comprehension
-    # note: upper and lower case is relevant here
-    occuIndex = next((index for (index, d) in enumerate(relevantDict) if d[key] == relevantObject), None)
-    if occuIndex is None:
-        return (-1)  # if it could not be found
-    return (occuIndex)
-
-
-def partCorrector(phrase, existingVariantsKldB):
-    """
-    This function cleans up an occupation specification.
-    Information that is not part of the actual occupation is filtered out.
-    In addition, an attempt is made to find a lemma for this occupation.
-    :param phrase: occupation (string)
-    :param existingVariantsKldB: already classified occupation KldB (list of dictionaries)
-    :return: information about the occupation (dictionary)
-    """
-    # initialization of variables so that they exist
-    titel = ""
-    role = ""
-    year = ""
-    url = ""
-    brackets = ""
-
-    # step 5: geographic prepositions ("loc" stands for location)
-    # find and save place names
-    # the place name is currently overwritten if several of them occur
-    # there are signal words that indicate a subsequent location (e.g. "in", "im")
-    # "loc" is needed for the upcoming functions
-    locationSignalWords = [" in ", " im ", " In ", " i. ", " von ", " v. ", " zu ",
-                           " auf ", " aus ", " Aus ", " an ", " der ", " des ", " van "]
-    for signalWord in locationSignalWords:
-        loc = endOfString(phrase, signalWord)
-        phrase = replaceLoc(signalWord, phrase, loc)
-
-    # besides location information there are signal words for employers
-    # "loc" continues to be used here, even though the literal sense no longer fits
-    employerSignalWords = [" bei ", " bei dem ", " beim ", " bei der "]
-    for signalWord in employerSignalWords:
-        loc = endOfString(phrase, signalWord)
-        phrase = replaceLoc(signalWord, phrase, loc)
-
-    # then there are signal words in front of an occupation, which makes clear the affiliation to a dominion
-    affiliation = ["herrschaftlich", "herrschaftliche", "herrschaftlicher", "königlich", "königliche", "königlicher",
-                   "fürstlich", "fürstliche", "fürstlicher"]
-    for i in affiliation:
-        if i in phrase:
-            # this information should not be deleted from the occupation statement
-            # it should only be stored in "loc" to be output separately afterwards
-            # if "loc" is empty, then no comma should precede it
-            if loc != "":
-                loc = loc + ", " + i
-            else:
-                loc = i
-
-    # find and save years
-    # more detailed dates are made to year information
-    # assumption: Year numbers always have four digits and are at the beginning
-    # check if the first character is a number
-    if phrase[:1].isdigit():
-        # check if the first four characters are a number
-        if phrase[:4].isdigit():
-            # separate year and part behind
-            year = phrase[:4]
-            phrase = phrase[4:]
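-    # illustrative: phrase "1807 Bauer" yields year = "1807" and phrase = " Bauer"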
-
-    # brackets content
-    if "(" in phrase and ")" in phrase:
-        brackets = phrase[phrase.find("("):phrase.find(")")]
-        phrase = phrase[:phrase.find("(")] + phrase[phrase.find(")") + 2:] # +2 because of parenthesis and space
-    if "[" in phrase and "]" in phrase:
-        brackets = phrase[phrase.find("["):phrase.find("]")]
-        phrase = phrase[:phrase.find("[")] + phrase[phrase.find("]") + 2:] # +2 because of parenthesis and space
-
-    # find and save URLs
-    # example: <a href="https:undde.wikipedia.org/wiki/Geschichte_des_Kantons_Thurgau#Grafen_im_Thurgau">Graf im Thurgau</a>
-    if "<a" in phrase and "</a>" in phrase:
-        url = phrase[phrase.find("<a"):phrase.find("</a>")]
-        phrase = phrase[:phrase.find("<a")] + phrase[phrase.find("</a>"):]
-
-    # find and save role
-    # wife
-    if "F. d." in phrase:
-        role = "Frau"
-        phrase = endOfString(phrase, "F. d.")
-    if "Ehefrau des" in phrase:
-        role = "Frau"
-        phrase = endOfString(phrase, "Ehefrau des")
-    if "Ehefrau d." in phrase:
-        role = "Frau"
-        phrase = endOfString(phrase, "Ehefrau d.")
-    if "Ehefrau" in phrase:
-        role = "Frau"
-        phrase = endOfString(phrase, "Ehefrau")
-    if "frau" in phrase and "Haus" != phrase[:4] and "Acker" != phrase[:5]:
-        role = "Frau"
-        phrase = phrase.replace("sfrau", "")
-        phrase = phrase.replace("frau", "")
-    # daugther
-    if "T. d." in phrase:
-        role = "Tochter"
-        phrase = endOfString(phrase, "T. d.")
-    if "tochter" in phrase:
-        role = "Tochter"
-        phrase = phrase.replace("stochter", "")
-        phrase = phrase.replace("tochter", "")
-    # son
-    if "S. d." in phrase:
-        role = "Sohn"
-        phrase = endOfString(phrase, "S. d.")
-    if "sohn" in phrase:
-        role = "Sohn"
-        phrase = phrase.replace("ssohn", "")
-        phrase = phrase.replace("sohn", "")
-
-    # find and save titles
-    if "Prof." in phrase:
-        titel = "Professor"
-        phrase = endOfString(phrase, "Prof.")
-    if "Professor" in phrase:
-        titel = "Professor"
-        phrase = endOfString(phrase, "Professor")
-
-    # step 9: temporal prepositions and numerals
-    if " am " in phrase:
-        year = endOfString(phrase, " am ")
-        phrase = phrase.replace(" am ", "")
-        phrase = phrase.replace(year, "")
-    if " bis " in phrase:
-        year = endOfString(phrase, " bis ")
-        phrase = phrase.replace(" bis ", "")
-        phrase = phrase.replace(year, "")
-
-    # delete numbers, unless they end with a dot or there are 4 consecutive digits, then this is taken as year
-    numbers = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
-    numberLength = 0
-    prePart = phrase
-    for i in range(len(phrase)):
-        if prePart[i:i + 1] in numbers:
-            numberLength = numberLength + 1
-            if prePart[i + 1:i + 2] != "." and prePart[i + 1:i + 2] not in numbers:
-                if numberLength == 4:
-                    year = prePart[i - 3:i + 1]
-                    phrase = phrase.replace(year, "")
-                    numberLength = 0
-                else:
-                    phrase = phrase.replace(phrase[i - numberLength + 1:i + 1], "")
-                    numberLength = 0
-            elif phrase[i + 1:i + 2] == ".":
-                numberLength = 0
-
-    # remove remaining special characters
-    phrase = phrase.replace(":", "")
-
-    # remove blanks here again
-    # "cleanedOccupation" is what remains of the occupation specification
-    cleanedOccupation = phrase.strip()
-
-    # search if there is a corresponding pedant in the already classified occupational data
-    occuIndex = dictSearch(existingVariantsKldB, "Variante", cleanedOccupation)
-    # if occuIndex is not "-1", then a counterpart was found
-    if occuIndex != -1:
-        # KldB identifier
-        kldb = existingVariantsKldB[occuIndex]["OhdAB_01"]
-        # way of selection of a counterpart
-        info = "found direct"
-        # levDict stands for "Levenshtein dictionary"
-        # name not appropriate here, because no Levenshtein distance is used
-        # for uniformity of the variable it is used anyway
-        levDict = {"lemma row": occuIndex,  # line of the matching dictionary
-                   "variant": "",
-                   "best fit lemma": existingVariantsKldB[occuIndex]["Variante"],
-                   # designation of the appropriate occupation
-                   "absolute distance": "",
-                   "relative distance": "",
-                   "selection": ""
-                   }
-    # if occuIndex is "-1", no counterpart was found and a similarity analysis starts
-    elif occuIndex == -1 and cleanedOccupation != "":  # cleanedOccupation must not be empty
-        # similarity analysis
-        levDict = levenshteinDist(existingVariantsKldB, "Variante", cleanedOccupation, "OhdAB_01")
-        # setting the relative Levenshtein distance of 0.25 as the essential threshold for selection
-        if levDict["relative distance"] < 0.25:
-            levDict.update({"selection": 1})
-            kldb = existingVariantsKldB[levDict["lemma row"]]["OhdAB_01"]  # take the line here from the levDict
-            # way of selection of a counterpart
-            info = "found after levenshtein"
-        else:
-            # no counterpart found
-            levDict.update({"selection": 0})
-            kldb = ""
-            info = "not found"
-    # no occupation remains
-    else:
-        kldb = ""
-        info = "no occupational designation"
-        levDict = {"lemma row": "", "variant": "", "best fit lemma": "", "absolute distance": "",
-                   "relative distance": "", "selection": ""}
-
-    # store the information sorted for each phrase (occupation)
-    occupationResult = {
-        "occupation": cleanedOccupation,
-        "best fit lemma": levDict["best fit lemma"],
-        "row of best fit lemma": levDict["lemma row"],
-        "KldB 2010": kldb,
-        "titel": titel,
-        "role": role,
-        "location": loc,
-        "year": year,
-        "url": url,
-        "further info": brackets,
-        "selection info": info,
-        "similarity analysis": levDict
-    }
-    return (occupationResult)
-
-
-def abbreviationsCorrector(firstString, secondString):
-    """
-    This function compares two phrases and checks if one of them could be an abbreviation of the other.
-    If "s"econdString" is an abbreviation of "firstString", "firstString" will be returned truncated.
-    :param firstString: first phrase without abbreviation (string)
-    :param secondString: second phrase with abbreviation (string)
-    :return: resolved abbreviation of "firstString" (string)
-    """
-    # continue only if there is a dot in "secondString"
-    # first letters equal to runtime improvement
-    if "." in secondString and secondString[:1] == firstString[:1]:
-        positionDot = secondString.find(".")
-        # find the abbreviated part in the other string and delete it in the original name
-        # count backwards to find blanks
-        for position in range(positionDot, 0, -1):
-            if secondString[positionDot:positionDot + 1] == " ":
-                beforeDot = secondString[position:positionDot]
-                break;
-            elif position == 1:
-                beforeDot = secondString[:positionDot]
-
-        # testing the minimum length of the part before the dot
-        try:
-            # the part before the dot must not be too short
-            if positionDot - position < 4:
-                # if it is too short, return the original value
-                return (firstString)
-        except UnboundLocalError:
-            position = 0
-            beforeDot = secondString[position:positionDot]
-            # the part before the dot must not be too short
-            if positionDot - position < 4:
-                # if it is too short, return the original value
-                return (firstString)
-
-        if beforeDot in firstString:
-            positionPart = firstString.find(beforeDot) + len(beforeDot)
-            for position in range(positionPart, len(firstString) + 1):
-                # blank, hyphen or general end; +1 is allowed here, is then simply empty
-                if (firstString[position:position + 1] == " " or firstString[position:position + 1] == "-"
-                        or position == len(firstString)):
-                    positionEnd = position
-                    break
-            # abbreviation found, abbreviate original name
-            firstString = firstString[:positionPart] + ". " + firstString[positionEnd:]
-    return (firstString)
-
-
-def levenshteinDist(existingVariantsKldB, key, relevantObject, keyRelevantDict):
-    """
-    This function generates the Levenshtein distance between two strings.
-    :param existingVariantsKldB: already classified occupation KldB (list of dictionaries)
-    :param key: designation of the key in "existingVariantsKldB" used for comparison (string)
-    :param relevantObject: occupation for which a similar, already classified value is to be found (string)
-    :param keyRelevantDict: name of the column that contains the identifier (string)
-    :return: information on similarity analysis (dictionary)
-    """
-    # the best fitting value is to be found
-    # initial high values for a Levenshtein distance, which are undercut in any case
-    minimalDistAbs = 99999  # absolute
-    minimalDistRel = 99999  # relative
-    # binary variable, 0 if no hit was found, 1 if at least one hit was found
-    minOneFound = 0
-    # check against each existing entry
-    for counter, i in enumerate(existingVariantsKldB):
-        # Lower case for better comparability
-        relevantObjectLowerCase = relevantObject.lower()
-        existingVariantLowerCase = copy.copy(
-            i[keyRelevantDict]).lower()  # copy important because it is changed afterwards
-        # compare only if first letters are the same (serves to improve runtime)
-        if existingVariantLowerCase[:1] == relevantObjectLowerCase[:1]:
-            # calculate Levenshtein distance
-            levDistAbs = Levenshtein.distance(existingVariantLowerCase, relevantObjectLowerCase)
-            # multiply levDistAbs by the number of blanks (+1) and normalize by the length, to avoid matches like "gewesener königlicher Richter"/"gewesener königlicher Koch"
-            levDistRel = levDistAbs * (relevantObject.count(" ") + 1) / len(relevantObject)
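-            # illustrative: for relevantObject = "Schmiedt" and the variant "Schmied",
-            # levDistAbs = 1 and levDistRel = 1 * (0 + 1) / 8 = 0.125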
-            # when the next one fits better
-            if levDistRel < minimalDistRel:
-                minimalDistAbs = levDistAbs
-                minimalDistRel = levDistRel
-                bestFitLemma = i[keyRelevantDict]
-                cacheCounter = counter
-                # is overwritten until an equal one comes along
-                hitlist = [[i[keyRelevantDict], cacheCounter]]
-            # if the next one fits equally well (elif, so the hit just recorded above is not appended twice)
-            elif levDistRel == minimalDistRel:
-                hitlist.append([i[keyRelevantDict], counter])
-            # at least one hit
-            minOneFound = 1
-        # no similarity
-        else:
-            continue
-
-    # select one in case of multiple hits
-    # selection is made by greatest match from the front (matching letters)
-    try:
-        # if there were several hits of the same quality
-        # anything above 0.25 is assumed to be unrealistic here, serves to improve runtime
-        if len(hitlist) > 1 and minimalDistRel < 0.25:
-            # initialization of counters
-            numberMatchingChars = 0
-            maxNumberMatchingChars = 0
-            numberMatchingCharsList = []
-            for charPosition, j in enumerate(hitlist):
-                # if the respective letters of the strings to be compared are the same
-                if j[0][charPosition:charPosition + 1] == relevantObject[charPosition:charPosition + 1]:
-                    # count up
-                    numberMatchingChars = numberMatchingChars + 1
-                    # note the maximum number of characters
-                    maxNumberMatchingChars = numberMatchingChars
-                # reset, if another character comes
-                else:
-                    numberMatchingChars = 0
-                numberMatchingCharsList.append([charPosition, maxNumberMatchingChars])
-
-            # Selection of the result with the closest match (no longer has anything to do with Levenshtein distance)
-            longestMatch = 0
-            # iterate all results of the maxNumberMatchingCharsList
-            for j in numberMatchingCharsList:
-                # select so most suitable
-                if j[1] > longestMatch:  # [1] is maxNumberMatchingChars
-                    longestMatch = j[1]
-                    charPosition = j[0]  # [0] is charPosition
-                # there can be best results for the same time
-                # that is ignored at this point
-                # only one status message is issued
-                # the second, equally matching value, is not selected
-                if j[1] == longestMatch:
-                    # this may be due to the fact that equal values are compared
-                    # duplicates exist in the list of already classified occupational data
-                    # therefore values to be compared can be the same
-                    if hitlist[j[0]][0] == hitlist[charPosition][0]:
-                        print("Status: A duplicate exists in the list:", hitlist[j[0]][0], hitlist[charPosition][0])
-                        continue
-                    # but the values do not always have to be the same, they can also just have the same beginning
-                    print("Status: Two very similar values exist in the list:", hitlist[j[0]], longestMatch,
-                          relevantObject)
-
-            # overwrite the relevant variables
-            bestFitLemma = hitlist[charPosition][0]
-            cacheCounter = hitlist[charPosition][1]
-    except UnboundLocalError:
-        pass
-
-    # alternative, if the possibility above did not lead to success
-    # this may be due to the fact that abbreviations are included
-    if minimalDistRel >= 0.25:
-        # search for abbreviations marked with a dot
-        for counter, i in enumerate(existingVariantsKldB):
-            designationCopy = relevantObject.lower()
-            originalDesignation = copy.copy(i[key]).lower()  # copy important because it is changed afterwards
-            # only if first letters are equal (runtime improvement)
-            if originalDesignation[:1] == designationCopy[:1]:
-                # abbreviation handling
-                preDesignationCopy = designationCopy # save previous value
-                designationCopy = abbreviationsCorrector(designationCopy, originalDesignation)
-                if designationCopy == preDesignationCopy:
-                    # the same again the other way around
-                    originalDesignation = abbreviationsCorrector(originalDesignation, designationCopy)
-                levDist = Levenshtein.distance(originalDesignation, designationCopy)
-                if levDist < minimalDistRel:
-                    minimalDistRel = levDist
-                    # if the new value is smaller, then overwrite relevant variables
-                    bestFitLemma = i[key]
-                    cacheCounter = counter
-                # at least one hit
-                minOneFound = 1
-
-    if minOneFound == 0:
-        bestFitLemma = "nothing"  # occurs, if e.g. the first letter is a colon; there is no variant to
-        cacheCounter = -1
-    # merge information
-    levenDict = {
-        "lemma row": cacheCounter,
-        "variant": relevantObject,
-        "best fit lemma": bestFitLemma,
-        "absolute distance": minimalDistAbs,
-        "relative distance": minimalDistRel
-    }
-    return (levenDict)
-
-
-def occucleaner(occu, existingVariantsKldB):
-    """
-    This function cleans up individual occupation information.
-    It is also essential that various information is separated from the original job title.
-    This can concern several job titles, but also non-professional information.
-    :param occu: occupational title
-    :param existingVariantsKldB: already classified occupation KldB (list of dictionaries)
-    :return: information about the different occupational indications in the original indication (dictionary)
-    """
-
-    # storage of the original occupational title
-    originalOccu = occu
-
-    # initialization
-    # "occu1" does not need to be initialized because there is at least one occupation specification
-    occu2 = ""
-    occu3 = ""
-    occu4 = ""
-    occu5 = ""
-
-    # initialization
-    part1 = ""
-    part2 = ""
-    part3 = ""
-    part4 = ""
-    part5 = ""
-
-    # general preprocessing
-
-    # step 1: Remove spaces at the beginning and end
-    occu = occu.strip()
-
-    # step 2: Write out abbreviations
-    if "mstr." in occu:
-        occu = occu.replace("mstr.", "meister")
-    if "Ing." in occu:
-        occu = occu.replace("Ing.", "Ingenieur")
-
-    # step 3: Normalize separation operators
-    occu = separator(occu, " u.", " und")
-    occu = separator(occu, "+", " und ")  # there are also "und" (and) without spaces
-    occu = separator(occu, ", ", " und ")
-    occu = separator(occu, ",", " und ")
-    occu = separator(occu, "; ", " und ")
-    occu = separator(occu, " & ", " und ")
-    occu = separator(occu, " / ", " und ")
-    occu = separator(occu, "/", " und ")
-
-    # detail processing
-
-    # separate multiple occupations
-    partList = [part1, part2, part3, part4, part5]  # parts are still all empty here
-    partCounter = 0
-    trennoperator = " und "
-    partList[0] = occu  # is needed for initialization because the while loop accesses the next one
-    # < 4, because not infinite parts should be made
-    while trennoperator in partList[partCounter] and partCounter < 4:
-        st = partList[partCounter]
-        partList[partCounter] = st[:st.find(" und ")]
-        partList[partCounter + 1] = st[(st.find(" und ") + len(" und ")):]
-        partCounter = partCounter + 1
-
-    # write the values back from partList
-    part1 = partList[0]
-    part2 = partList[1]
-    part3 = partList[2]
-    part4 = partList[3]
-    part5 = partList[4]
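-    # illustrative: "Bauer und Schmied" yields part1 = "Bauer" and part2 = "Schmied"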
-
-    if partCounter == 0:  # if there is only one part
-        part1 = occu
-
-    # add the occupation dicts to the designation dict
-    occu1 = partCorrector(part1, existingVariantsKldB)
-    if part2 != "":  # wenn es kein part2 gibt, dann einfach weiter
-        occu2 = partCorrector(part2, existingVariantsKldB)
-        if part3 != "":  # kann nur sein, wenn Part 2 vorher auch war
-            occu3 = partCorrector(part3, existingVariantsKldB)
-            if part4 != "":
-                occu4 = partCorrector(part4, existingVariantsKldB)
-                if part5 != "":
-                    occu5 = partCorrector(part5, existingVariantsKldB)
-    # further parts could be added here
-
-    # information about the different occupational indications in the original indication
-    bezeichnung = {
-        "variant": originalOccu,
-        "occupation 1": occu1,  # occu1 ist ein dict (beruf)
-        "occupation 2": occu2,
-        "occupation 3": occu3,
-        "occupation 4": occu4,
-        "occupation 5": occu5
-    }
-
-    return (bezeichnung)
-
-
-def statistics(occuList, occuKeys):
-    """
-    This function counts the number of lemmatizations over the different process branches.
-    :param occuList: list of dictionaries with information to analysed occupational information
-    :param occuKeys: column headings for the analysis of separated occupations
-    """
-    counter = 0  # found directly in existing variants
-    counter0 = 0  # empty occupational designations (only came about as a result of cleanup, e.g. because only location information was given)
-    counter2 = 0  # found by Levenshtein distance
-    counter3 = 0  # could not be found
-    counter4 = 0  # found by Levenshtein distance NV
-    counter5 = 0  # found directly in existing variants NV
-
-    for i in occuList:
-        # iterate all the occupations stored in it
-        for j in i:
-            # iterate the five possible keys ("occupation 1", ...)
-            for key in occuKeys:
-                # if the entry for the key does not contain any content, skip it
-                if j[key] == "":
-                    continue
-                elif j[key]["selection info"] == "found direct":
-                    counter = counter + j["number"]
-                elif j[key]["selection info"] == "found after levenshtein":
-                    counter2 = counter2 + j["number"]
-                elif j[key]["selection info"] == "not found":
-                    counter3 = counter3 + j["number"]
-                elif j[key]["selection info"] == "no occupational designation":
-                    counter0 = counter0 + j["number"]
-                elif j[key]["selection info"] == "found after levenshtein NV":
-                    counter4 = counter4 + j["number"]
-                elif j[key]["selection info"] == "found direct NV":
-                    counter5 = counter5 + j["number"]
-                else:
-                    print("Error: Selection information is missing.")
-
-    # output of statistical information
-    counterSum = counter0 + counter + counter2 + counter3 + counter4 + counter5
-    print("Status: Proportion of adjusted occupations found directly in the variants:", counter / counterSum,
-          counter)
-    print("Status: proportion of adjusted occupations found directly in the variants NV:",
-          counter5 / counterSum,
-          counter5)
-    print("Share ... Levensthein distance:", counter2 / counterSum, counter2)
-    print("Share ... Levensthein distance NV:", counter4 / counterSum, counter4)
-    print("Share ... not found", counter3 / counterSum, counter3)
-    print("Share of empty job titles (through cleanup)", counter0 / counterSum, counter0)
-
-
-def preCreateOccuList(filename, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu,
-                      filenameVariants, filenameDistance, filenameOccu, occuKeys):
-    """
-    This function creates a list of the occupation information available in a GEDCOM file.
-    :param filename: designation of the file (string)
-    :param existingVariantsKldB: data on the already classified occupation information
-    :param fieldnamesVariants: column headings of the newVariants.csv file
-    :param fieldnamesDistance: column headings of the levenshteinDistance.csv
-    :param fieldnamesOccu: column headings of the occuResult.csv file
-    :param filenameVariants: path and name of the newVariants.csv file
-    :param filenameDistance: path and name of the levenshteinDistance.csv
-    :param filenameOccu: path and name of the occuResult.csv file
-    :param occuKeys: keys for the separated professions
-    :return: list with occupation information
-    """
-    # a loop with one pass is necessary to be able to formulate a termination condition
-    for start in range(1):
-        # saving the name of the parallelization process
-        spawnPoolWorker = current_process().name
-
-        # loading data of a GEDCOM file
-        data = loadGedcomFile(filename)
-
-        # status information
-        print(spawnPoolWorker, "Status: The analysis of the occupational data for file", filename, "begins.")
-
-        # list of all occupations in one source
-        allOccupationsInSource = []
-
-        # iteration of each line in the GEDCOM file
-        for counter, i in enumerate(data):
-            # continue if OCCU tag is present
-            if i[2:6] == "OCCU":
-                occupation = i[7:]
-                # some files have the anomaly that the OCCU tag is empty, but the profession information is in the PLAC tag below it
-                # if this is the case, the information of the next line should be used
-                if occupation == "":
-                    occupation = data[counter + 1][7:]
-                    allOccupationsInSource.append(occupation)
-
-    # createOccuList is called iteratively here, because this function itself is already executed via parallelization
-    occuList = []
-    for i in allOccupationsInSource:
-        occuList.append(createOccuList(i, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu,
-                                       filenameVariants, filenameDistance, filenameOccu, occuKeys, filename)[
-                            0])  # "[0]" at the end is necessary because the function returns a list
-    return (occuList)
-
-
-def createOccuList(phrase, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu,
-                   filenameVariants, filenameDistance, filenameOccu, occuKeys, filename):
-    """
-    This function creates a list of the occupation information available in a source.
-    :param phrase: single occupation specification (string)
-    :param existingVariantsKldB: data on the already classified occupation information
-    :param fieldnamesVariants: column headings of the newVariants.csv file
-    :param fieldnamesDistance: column headings of the levenshteinDistance.csv
-    :param fieldnamesOccu: column headings of the occuResult.csv file
-    :param filenameVariants: path and name of the newVariants.csv file
-    :param filenameDistance: path and name of the levenshteinDistance.csv
-    :param filenameOccu: path and name of the occuResult.csv file
-    :param occuKeys: keys for the separated professions
-    :param filename: designation of the file (string)
-    :return: list with location information
-    """
-    # loading data of new variants
-    # this is necessary every time, because an identical job title can occur in one of the parallel processes
-    newVariants = loadData(filenameVariants, "\t", "latin1")
-
-    # a loop with one pass is necessary to be able to formulate a termination condition
-    for start in range(1):
-        # create a list with information about the new variants
-        occuList = []  # list of unadjusted variants in the source (list entries are dictionaries with a lot of information)
-        designationList = []  # list of adjusted variants in the source
-
-        # if the variant has already been edited, it should not be edited again
-        # however, a counter should then be incremented, which documents the number
-        if phrase in designationList:  # comparison with already processed variants
-            # search for the entry in the occuList that matches the variant
-            for j in occuList:
-                if j["variant"] == phrase:
-                    # count up number
-                    j["number"] = j["number"] + 1
-            # skip processing
-            continue
-        # if the occupation information has not yet been processed, then this should be done as follows
-        else:  # occupation statement for the first time in this source
-            # extension of the list of processed designations
-            designationList.append(phrase)
-            # variant cleanup
-            resultOccucleaner = occucleaner(phrase, existingVariantsKldB)
-            # completing the file name and setting the occurrence to 1
-            resultOccucleaner.update({"source": filename})
-            resultOccucleaner.update({"number": 1})
-            # adding to the occuList in a dictionary
-            occuList.append(resultOccucleaner)
-
-        # if the selection was made on the basis of the Levenshtein distance, this information should be saved
-        # two lists are created for this purpose
-        levenList = []  # list is used to create the content for a new row in newVariants.csv
-        levenList2 = []  # list is used to create the content for a new line in levenshteinDistance.csv
-
-        # Iteration per occupation specification in the source
-        for i in occuList:
-            # check all five possible separated professions
-            for key in occuKeys:
-                # if entry for the key is not filled in, then skip it
-                if i[key] == "":
-                    continue
-                if i[key]["selection info"] == "found after levenshtein":
-                    newDict = {
-                        "variant": i[key]["occupation"],
-                        "lemma": existingVariantsKldB[i[key]["row of best fit lemma"]]["Variante"],
-                        "OhdAB_01": i[key]["KldB 2010"]
-                    }
-                    levenList.append(newDict)
-                elif i[key]["selection info"] == "found after levenshtein NV":
-                    newDict = {
-                        "variant": i[key]["occupation"],
-                        "lemma": newVariants[i[key]["row of best fit lemma"]]["Variante"],
-                        "OhdAB_01": i[key]["KldB 2010"]
-                    }
-                    levenList.append(newDict)
-                if i[key]["similarity analysis"] != "":  # for levenshteindistance.csv
-                    levenList2.append(i[key]["similarity analysis"])
-
-        # write the new lines to all files as one block
-        # all files should stay at the same level
-        # so if an error occurs with one variable, none of the files is written to
-        try:
-            appendFile(filenameOccu, occuList, fieldnamesOccu)
-            appendFile(filenameVariants, levenList, fieldnamesVariants)
-            appendFile(filenameDistance, levenList2, fieldnamesDistance)
-        except Exception:
-            print(
-                "Error: Blocked printing of the lines failed. Manual deletion of the entries of the last file appropriate.")
-    return (occuList)
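-
-# a minimal sketch of a row written to newVariants.csv above (all values hypothetical):
-# {"variant": "ackermann", "lemma": "ackerer", "OhdAB_01": "61102"}
-# the keys correspond to the fieldnamesVariants columns defined in the main block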
-
-
-if __name__ == '__main__':
-    # part up to 'parallelization' is executed once at the beginning
-
-    # storage of the time at the beginning of the program run
-    starttime = time.perf_counter()
-    # loading the sources (exemplary here: GEDCOM files from GEDBAS)
-    # definition of the range in which the file names are located (e.g. 1.ged to 60000.ged)
-    begin = 0
-    end = 60000
-    # creation of a list with the possible file names
-    gedcomNamesList = []
-    while begin != end:
-        datename = str(begin) + ".ged"
-        gedcomNamesList.append(datename)
-        begin = begin + 1
-    # check if the files exist
-    # exclude non-existent files
-    gedcomNamesListClear = []
-    for i in gedcomNamesList:
-        # files are located in the 'data' subfolder
-        filepath = os.path.join("data", i)
-        try:
-            # if opening works, the file exists and is added to a new list
-            gedcom = open(filepath, "r", encoding="utf-8")
-            gedcom.close()
-            gedcomNamesListClear.append(i)
-        except FileNotFoundError:
-            pass
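-    # a side-effect-free sketch of the existence check above, assuming os.path.isfile is preferred:
-    # gedcomNamesListClear = [name for name in gedcomNamesList
-    #                         if os.path.isfile(os.path.join("data", name))]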
-
-    # open more context data
-    # data from the Historical Data Center of Saxony-Anhalt
-    # classification based on the Klassifikation der Berufe (KldB, Classification of Professions)
-    # data from another classification system can also be used here
-    # file contains already classified occupational variants
-    filename = os.path.join("data", "varianten_goldberg.csv")
-    fieldnames = ["id_variante",  # unique ID of the occupational variant
-                  "variant",  # textual representation of the variant
-                  "Erstberuf",  # textual standardization of the professional title
-                  "OhdAB_01",  # code of the OhdAB
-                  "Berufsrolle",  # professional role (e.g. officer, leader)
-                  "Geschlecht",  # sex
-                  "Familienrolle",  # family role (e.g. daughter)
-                  "PrimaryFirst",  # ??? todo
-                  "MatchSequence",  # ??? todo
-                  "filter_$"  # ??? todo
-                  ]
-    # loading data from existing file
-    # if no file exists, a new one is created
-    existingVariantsKldB = createFile(filename, fieldnames, ";", "latin1")
-
-    # status message on the number of existing variants
-    print("Status:", len(existingVariantsKldB), "classified variants already exist.")
-
-    # if halving of variants is to be done for testing purposes, set halving to "yes"
-    halving = "no"
-    # deletion of every second already classified occupation information
-    if halving == "yes":
-        remainingVariantsKldB = []
-        for zahl, i in enumerate(existingVariantsKldB):
-            if zahl % 2 == 0:
-                remainingVariantsKldB.append(i)
-        print("Status: There has been a halving of the variants for testing purposes.", len(remainingVariantsKldB),
-              "variants remain.")
-        # overwrite the variable of all variants
-        existingVariantsKldB = remainingVariantsKldB
-
-    # create file for saving the newly classified variants
-    filenameVariants = os.path.join("data", "newVariants.csv")
-    fieldnamesVariants = ["variant",  # designation of the new variant of an occupation
-                          "lemma",  # existing designation of an occupation to which the new variant is assigned
-                          "OhdAB_01"  # code according to KldB
-                          ]
-    createFile(filenameVariants, fieldnamesVariants, "\t", "latin1")
-
-    # list about the best hits for each checked job title
-    filenameDistance = "levenshteinDistance.csv"
-    fieldnamesDistance = ["relative distance",  # absolute Levenshtein distance divided by the length of the variant
-                          "absolute distance",  # absolute Levenshtein distance
-                          "variant",  # designation of the new variant of an occupation
-                          "best fit lemma",  # designation of the best fitting existing variant
-                          "selection",  # binary information whether the lemma was selected (1 means yes, 0 means no)
-                          "lemma row"  # number of the line in the existing variants
-                          ]
-    createFile(filenameDistance, fieldnamesDistance, "\t", "latin1")
-
-    # list for dividing the different components of a job specification
-    filenameOccu = "occuResult.csv"
-    fieldnamesOccu = ["variant",  # designation of the new variant of an occupation
-                      "source",  # name of the file in which the variant occurs (source)
-                      "number",  # Number of occurrences of the variant in the source
-                      "occupation 1",  # information about the first occupation found
-                      "occupation 2",  # information about the second occupation found
-                      "occupation 3",  # information about the third occupation found
-                      "occupation 4",  # information about the fourth occupation found
-                      "occupation 5"  # information about the fifth occupation found
-                      ]
-    createFile(filenameOccu, fieldnamesOccu, "\t", "latin1")
-
-    # definition of the keys for the separated professions
-    occuKeys = ["occupation 1", "occupation 2", "occupation 3", "occupation 4", "occupation 5"]
-
-    # initialization of a list in which the results of the upcoming parallelized process are stored
-    # this will process a list of occupation details in parallel
-    # the result is a list of dictionaries containing different information about the analysis (occuList)
-    occuList = []
-
-    # parallelization
-    pool = Pool(3)
-    occuList = pool.map(partial(preCreateOccuList,
-                                existingVariantsKldB=existingVariantsKldB,
-                                fieldnamesVariants=fieldnamesVariants,
-                                fieldnamesDistance=fieldnamesDistance,
-                                fieldnamesOccu=fieldnamesOccu,
-                                filenameVariants=filenameVariants,
-                                filenameDistance=filenameDistance,
-                                filenameOccu=filenameOccu,
-                                occuKeys=occuKeys), gedcomNamesListClear)
-    pool.close()
-    pool.join()
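-    # sketch of the pattern used above (f is a hypothetical function): functools.partial freezes
-    # the keyword arguments, so pool.map only varies the first positional parameter, e.g.
-    # partial(f, b=2)(1) is equivalent to f(1, b=2)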
-
-    # second processing loop for the designations that are not found but have components
-    # Example: "farmer and craftsman" is not found, but "farmer" and "craftsman" are found individually
-
-    # second processing
-    gedcomNamesListClear2 = []
-    # iterate all original occupation information
-    for i in occuList:
-        # iterate all the occupations separated in it
-        for j in i:
-            # iterate the five possible keys ("occupation 1", ...)
-            for key in occuKeys:
-                # if the entry for the key does not contain any content, skip it
-                if j[key] == "":
-                    continue;
-                # only professions that are "not found"
-                if j[key]["selection info"] == "not found":
-                    gedcomNamesListClear2.append(j["occupation 1"]["occupation"])
-
-    # parallelization
-    pool = Pool(3)
-    occuList2 = pool.map(partial(createOccuList,
-                                 existingVariantsKldB=existingVariantsKldB,
-                                 fieldnamesVariants=fieldnamesVariants,
-                                 fieldnamesDistance=fieldnamesDistance,
-                                 fieldnamesOccu=fieldnamesOccu,
-                                 filenameVariants=filenameVariants,
-                                 filenameDistance=filenameDistance,
-                                 filenameOccu=filenameOccu,
-                                 occuKeys=occuKeys,
-                                 filename=""), gedcomNamesListClear2)
-    pool.close()
-    pool.join()
-
-    # the same again for a third iteration
-
-    # third processing
-    gedcomNamesListClear3 = []
-    # iterate all original occupation information
-    for i in occuList2:
-        # iterate all the occupations separated in it
-        for j in i:
-            # iterate the five possible keys ("occupation 1", ...)
-            for key in occuKeys:
-                # if the entry for the key does not contain any content, skip it
-                if j[key] == "":
-                    continue
-                # only professions that are "not found"
-                if j[key]["selection info"] == "not found":
-                    gedcomNamesListClear3.append(j[key]["occupation"])
-
-    # parallelization
-    pool = Pool(3)
-    occuList3 = pool.map(partial(createOccuList,
-                                 existingVariantsKldB=existingVariantsKldB,
-                                 fieldnamesVariants=fieldnamesVariants,
-                                 fieldnamesDistance=fieldnamesDistance,
-                                 fieldnamesOccu=fieldnamesOccu,
-                                 filenameVariants=filenameVariants,
-                                 filenameDistance=filenameDistance,
-                                 filenameOccu=filenameOccu,
-                                 occuKeys=occuKeys,
-                                 filename=""), gedcomNamesListClear3)
-    pool.close()
-    pool.join()
-
-    # creation of statistics for the three iterations
-    statistics(occuList, occuKeys)
-    statistics(occuList2, occuKeys)
-    statistics(occuList3, occuKeys)
-
-    # storage of the time at the end of the program run
-    finishtime = time.perf_counter()
-
-    # status info
-    print("Status: Program finished in", round(finishtime - starttime, 2), "seconds(s)")
diff --git a/2022_005_goldberg/Skripte/placefinder.py b/2022_005_goldberg/Skripte/placefinder.py
new file mode 100644
index 0000000000000000000000000000000000000000..1260b6c37fe4e3bc8ecaf1ec37f4ae9b0955b8d5
--- /dev/null
+++ b/2022_005_goldberg/Skripte/placefinder.py
@@ -0,0 +1,693 @@
+from Levenshtein import distance
+from haversine import haversine
+import copy
+import qualitychecker
+
+
+def placeFinder(locNameClean, miniGOV, gedcomMetaInfos, bannedObjectTypes):
+    """
+    This function prepares urban names for identification.
+    :param locNameClean: name of place after cleansing
+    :param miniGOV: list of all objects in the Mini-GOV
+    :param gedcomMetaInfos: content for one line of the file "quality.csv"
+    :param bannedObjectTypes: list of banned object types
+    :return: list of GOV identifier, longitude, latitude, and information about identification process
+    """
+    # read the cluster midpoints determined for the current file from gedcomMetaInfos
+    clusterMeanList = gedcomMetaInfos["cluster midpoints"]
+
+    # call find() to obtain position and selection information of the selected object
+    resultOfFind = find(miniGOV, locNameClean, clusterMeanList, bannedObjectTypes)
+    positionMiniGOV = resultOfFind[0]  # number of row of selected object in Mini-GOV
+    selectInfo = resultOfFind[1]  # information about the way of selecting/identifying the object
+    # if find() was not successful, positionMiniGOV is -1 and the parameters are set to "NONE"
+    if positionMiniGOV != -1:
+        govid = miniGOV[positionMiniGOV]["GOV-Kennung"]  # ID of GOV object
+        longitude = miniGOV[positionMiniGOV]["geographische Länge"]  # longitude
+        latitude = miniGOV[positionMiniGOV]["geographische Breite"]  # latitude
+        return [govid, longitude, latitude, selectInfo]
+    govid = "NONE"
+    longitude = "NONE"
+    latitude = "NONE"
+    return [govid, longitude, latitude, selectInfo]
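+
+# a minimal sketch of the return value (coordinates are hypothetical): a successful call may yield
+# ["LUBECKJO53IU", "10.68", "53.87", "Selected based on a single matching hit"],
+# an unsuccessful one ["NONE", "NONE", "NONE", <selectInfo explaining the failure>]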
+
+
+def find(miniGOV, locNameClean, clusterMeanList, bannedObjectTypes):
+    """
+    This function identifies an adjusted urbanonym.
+    :param miniGOV: list of all objects in the Mini-GOV
+    :param locNameClean: name of place/urbanonym after cleansing
+    :param clusterMeanList: list of means of coordinates for the clusters in a source
+    :param bannedObjectTypes: list of banned object types
+    :return: index of the line in "miniGOV" of the identified location and information about the type of identification
+    """
+    # heading of the column with the relevant information (actual name) of objects
+    keyMiniGOV = "aktueller Name"
+
+    # banned placeholder values (set during cleansing)
+    if locNameClean == "unrealisticSequenceOfStringsAlpha":
+        selectInfo = "Not selected on the basis of prohibited content."
+        return ([-1, selectInfo])
+    elif locNameClean == "unrealisticSequenceOfStringsBeta":
+        selectInfo = "Not selected based on prohibited specification."
+        return ([-1, selectInfo])
+
+    # define rank order of some types (priority)
+    # if there are several matches, it is more likely to be a "Stadt" (more urban) than a "Ort" (more rural)
+    orderRankObjectTypes = ["Kreisfreie Stadt",
+                            "Stadt",
+                            "Dorf",
+                            "Pfarrdorf",
+                            "Ort",
+                            "Ortsteil",
+                            "Ortschaft",
+                            "Wohnplatz",
+                            "Weiler"]
+
+    # cleaned location data can contain several urbanonyms (e.g. places in brackets, a hyphen as separator)
+    # these must be checked individually if one of them cannot be identified
+    # "valueList" contains all sub-urbanonyms to be examined and starts with the entire cleaned-up designation
+    valueList = [locNameClean]
+    # search for hyphens
+    if "-" in locNameClean:
+        positionMinus = locNameClean.find("-")
+        # first add what comes before the hyphen
+        valueList.append(locNameClean[:positionMinus])
+        valueList.append(locNameClean[positionMinus + 1:])
+    # search for brackets
+    if "(" in locNameClean and ")" in locNameClean:
+        positionBreakedOpen = locNameClean.find("(")
+        positionBreakedClose = locNameClean.find(")")
+        # first add what is not in brackets
+        # assumption: the brackets are not in front and have a space before (therefore -1)
+        valueList.append(locNameClean[:positionBreakedOpen - 1])
+        valueList.append(locNameClean[positionBreakedOpen + 1:positionBreakedClose])
+    # as a fallback, also check the part up to the first space
+    if " " in locNameClean:
+        positionSpace = locNameClean.find(" ")
+        valueList.append(locNameClean[:positionSpace])
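+    # worked example (sketch): for locNameClean = "castrop-rauxel" the candidate list built above
+    # is ["castrop-rauxel", "castrop", "rauxel"]; the complete name is always tried first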
+
+    # testing of the different urbanonym components
+    # if anything can be identified, the loop is terminated and not all loop passes are needed
+    for counter, newLocValueClean in enumerate(valueList):
+        # the first candidate is the complete cleaned urbanonym, the sub-urbanonyms follow
+        # binary search algorithm begins here
+        position = int(len(miniGOV) / 2)  # start in the middle of the Mini-GOV
+        # initialized with the list length, not 0, so that the first search step has a sensible size
+        furtherPosition = len(miniGOV)
+        # execute loop until the new position is only 10 lines away from the old one
+        while (furtherPosition - position) not in range(-10, 10):
+            positionCache = position
+            # designation from the Mini-GOV must be converted to lower case
+            if newLocValueClean > miniGOV[position][keyMiniGOV].lower():
+                # amount of difference between "furtherPosition" and "position" / 2
+                position = position + int(abs(furtherPosition - position) / 2)
+            elif newLocValueClean < miniGOV[position][keyMiniGOV].lower():
+                # amount of difference between "furtherPosition" and "position" / 2
+                position = position - int(abs(furtherPosition - position) / 2)
+            elif newLocValueClean == miniGOV[position][keyMiniGOV].lower():
+                break  # runtime improvement, it cannot get more precise
+            furtherPosition = positionCache
+        # search for matches within 30 lines before and after the found position
+        # the window of 30 on each side covers cases where e.g. 60 places share the same name
+        # note: a negative index would not raise an IndexError but wrap around the list, so clamp explicitly
+        startPosition = max(position - 30, 0)
+        endPosition = min(position + 30, len(miniGOV))
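+        # at this point the binary search has narrowed "position" to within roughly 10 lines of
+        # the place where newLocValueClean would sort in the alphabetically ordered Mini-GOV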
+
+        # initialising of lists
+        equalList = []  # values that are equal
+        similarList = []  # values that are similar
+
+        # similarity analysis
+        if newLocValueClean != "":  # similarity analysis makes sense, if "newLocValueClean" is not empty
+            # creation of a list with the results
+            similarityList = []
+            # check each position 30 lines before and after the previously found line
+            for i in range(startPosition, endPosition):
+                # use of the levenshtein distance for equality checks
+                levenshteinDistance = distance(miniGOV[i][keyMiniGOV].lower(), newLocValueClean)
+                similarityList.append([i,  # index (actual position in Mini-GOV)
+                                       levenshteinDistance,  # absolute levenshtein distance
+                                       levenshteinDistance / len(newLocValueClean),  # relative levenshtein distance
+                                       miniGOV[i][keyMiniGOV].lower(),  # comparative Mini-GOV designation
+                                       newLocValueClean  # comparative urbanonym
+                                       ])
+            # search for hits where the Levenshtein distance was 0 (equality)
+            for i in similarityList:
+                # if levenshteinDistance is 0, both strings are the same
+                position = i[0]
+                levenshteinDistance = i[1]
+                if levenshteinDistance == 0:
+                    equalList.append(position)  # equalList contains only line numbers
+            # if there is no hit with Levenshtein distance 0, check for hits with a relative Levenshtein distance of at most 0.17
+            if len(equalList) == 0:
+                for i in similarityList:
+                    if i[2] <= 0.17:
+                        similarList.append(i[0])  # similarList contains only line numbers
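+            # worked example (sketch): comparing "dortmunt" with the Mini-GOV entry "dortmund"
+            # gives an absolute distance of 1 and a relative distance of 1/8 = 0.125 <= 0.17,
+            # so that entry would be added to similarList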
+
+        # check length of equalList and similarList
+        # "equalList" has a priority over "similarList"
+        # "selectInfo" explains if and how an identification takes place
+        if len(equalList) == 0:
+            # no same hit but exactly one similar hit
+            if len(similarList) == 1:
+                # even if there is only one hit, it must not have a banned object type
+                if miniGOV[similarList[0]]["Objekttyp als Zahl"] in bannedObjectTypes:
+                    selectInfo = "Not selected because nothing was found in the Mini-GOV (with similarity analysis)"
+                    return ([-1, selectInfo])
+                else:
+                    selectInfo = "Selected based on a single matching hit in the similarity analysis"
+                    return ([similarList[0], selectInfo])
+            # no same hit but more then one similar hit
+            elif len(similarList) > 1:
+                # start a selection
+                resultAreaSearch = areaSearch(similarList,
+                                              "(with similarity analysis)",
+                                              miniGOV,
+                                              clusterMeanList,
+                                              bannedObjectTypes,
+                                              orderRankObjectTypes)
+                return (resultAreaSearch)  # return value has the same structure as the previous
+            # no equal or similar hit
+            # the next candidate should still be tried; a value is only returned once the last element of "valueList" has been checked
+            elif len(similarList) == 0 and (counter + 1) == len(valueList):
+                selectInfo = "Not selected because nothing was found in the Mini-GOV (with similarity analysis)"
+                return ([-1, selectInfo])
+        # exactly one hit in "equalList"
+        elif len(equalList) == 1:
+            selectInfo = "Selected based on a single matching hit"
+            return ([equalList[0], selectInfo])
+        # more then one hits in "equalList"
+        elif len(equalList) > 1:
+            resultAreaSearch = areaSearch(equalList,
+                                          "",
+                                          miniGOV,
+                                          clusterMeanList,
+                                          bannedObjectTypes,
+                                          orderRankObjectTypes)
+            return (resultAreaSearch)  # return value has the same structure as the previous
+
+    # if nothing is found until here, then return -1
+    selectInfo = "Nothing selected because nothing was found in the Mini-GOV"
+    return ([-1, selectInfo])
+
+
+def areaSearch(similarList, supplementSelectInfo, miniGOV, clusterMeanList, bannedObjectTypes, orderRankObjectTypes):
+    """
+    This function selects one of several possible locations.
+    The basis for this is the distance to the other identified locations in the source.
+    :param similarList: list of line numbers in the Mini-GOV that match the urbanonym
+    :param supplementSelectInfo: text that can be appended to "selectInfo"
+    :param miniGOV: list of all objects in the Mini-GOV
+    :param clusterMeanList: list of means of coordinates for the clusters in a source
+    :param bannedObjectTypes: list of banned object types
+    :param orderRankObjectTypes: list that defines rank order of some object types
+    :return: list of selected position and an information about the selection/identification process
+    """
+    # read coordinates from the Mini-GOV and write them into coordList
+    coordList = []
+    for i in similarList:  # i is position in Mini-GOV
+        longitude = miniGOV[i]["geographische Länge"]
+        latitude = miniGOV[i]["geographische Breite"]
+        coordList.append([i, longitude, latitude])
+
+    # calculate similarity of values in "coordList"
+    geoDistanceList = []
+    for i in coordList:
+        # ignore entries without valid coordinates
+        if i[1] != "" and i[1] != "NONE" and i[2] != "" and i[2] != "NONE" and len(clusterMeanList) != 0:
+            # calculate a distance for each cluster center of the source and write it into a list
+            for j in clusterMeanList:  # clusterMeanList consists of "cluster midpoints"
+                # latitude coordinate 1, longitude coordinate 1, latitude coordinate 2, longitude coordinate 2
+                geoDistance = haversine((float(i[2]), float(i[1])), (float(j[0]), float(j[1])))
+                geoDistanceList.append([i, geoDistance])
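+    # haversine returns the great-circle distance in kilometres, e.g.
+    # haversine((53.55, 9.99), (52.52, 13.41)) is roughly 255 (Hamburg to Berlin)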
+
+    # determination of the smallest distance
+    minimalDistance = float("inf")  # initial value larger than any real distance
+    for i in geoDistanceList:
+        newDistance = i[1]  # haversine distance is stored at index 1
+        if newDistance < minimalDistance:
+            minimalDistance = newDistance
+            positionMiniGOV = i[0][0]  # line number of the entry in the Mini-GOV that has the smallest distance
+
+    # only one value with coordinates remains
+    # one distance entry is created per cluster midpoint, so a single remaining candidate yields exactly 1 * len(clusterMeanList) entries; the != 0 condition excludes an empty cluster list
+    if len(geoDistanceList) == 1 * len(clusterMeanList) and len(geoDistanceList) != 0:
+        selectInfo = "Selected because it was the only one with coordinates " + supplementSelectInfo
+    # several values remain, but the closest value is selected
+    elif len(geoDistanceList) > 1 * len(clusterMeanList):
+        selectInfo = "Selected on the basis of geographical proximity " + supplementSelectInfo
+    # no distance was determined
+    elif len(geoDistanceList) == 0:
+        # no candidate has coordinates, but some may still be excluded via their object types
+        # creation of a list in which the unauthorized types are filtered out
+        noGeoDistButAllowedTypeList = []
+        for i in coordList:
+            position = i[0]
+            if miniGOV[position]["Objekttyp als Zahl"] not in bannedObjectTypes:
+                noGeoDistButAllowedTypeList.append(i)
+        # one object remains, choose this
+        if len(noGeoDistButAllowedTypeList) == 1:
+            selectInfo = "Selected based on the only valid type " + supplementSelectInfo
+            positionMiniGOV = noGeoDistButAllowedTypeList[0][0]
+        # no element is left over
+        elif len(noGeoDistButAllowedTypeList) == 0:
+            selectInfo = "None selected, because none has a valid type " + supplementSelectInfo
+            positionMiniGOV = -1  # must be described, because the variable has not yet been described
+        # several are left over
+        # selection via ranking order of the object types
+        else:
+            for objectTyp in orderRankObjectTypes:
+                # initialization of a list in which all elements of a type are written
+                objectTypeRankList = []
+                for elementCoordList in noGeoDistButAllowedTypeList:
+                    if miniGOV[elementCoordList[0]]["Objekttyp als Text"] == objectTyp:
+                        objectTypeRankList.append(elementCoordList[0])
+                # the checks run only after all candidates of this type have been collected
+                # exactly one object of this type remains, then select it
+                if len(objectTypeRankList) == 1:
+                    positionMiniGOV = objectTypeRankList[0]
+                    selectInfo = "Selected on the basis of a suitable type " + supplementSelectInfo
+                    return ([positionMiniGOV, selectInfo])  # e.g. a city was found and preferred over a village
+                # multiple hits, none can be selected
+                elif len(objectTypeRankList) > 1:
+                    positionMiniGOV = -1
+                    selectInfo = "Not selected based on too many matching types " + supplementSelectInfo
+                    return ([positionMiniGOV, selectInfo])
+                # if no hit, the loop is repeated with the next object type
+            # this part of the function is only executed if the identification has failed finally
+            selectInfo = "Not selected, because no heuristic gives a result " + supplementSelectInfo
+            positionMiniGOV = -1
+    return ([positionMiniGOV, selectInfo])
+
+
+def stringFunc1(behindTag, string):
+    """
+    This function removes strings from "behindTag".
+    :param behindTag: urbanonym
+    :param string: forbidden string
+    :return: urbanonym purged from the forbidden string
+    """
+    # if the string is not at the beginning, cut off everything from its position onwards
+    # if it is at the beginning, just remove it
+    if string in behindTag:
+        if behindTag.find(string) != 0:  # not at the beginning
+            position = behindTag.find(string)
+            behindTag = behindTag[:position]
+        else:  # at the beginning
+            behindTag = behindTag.replace(string, "")
+    return (behindTag)
+
+
+def stringFunc2(behindTag, string):
+    """
+    This function is used to remove strings in "behindTag" if they are at the beginning.
+    :param behindTag: urbanonym
+    :param string: forbidden string
+    :return: urbanonym purged from the forbidden string
+    """
+    if string in behindTag:
+        if behindTag.find(string) == 0:
+            behindTag = behindTag.replace(string, " ")
+    return (behindTag)
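+
+# minimal usage sketches for the two helpers above (inputs are hypothetical):
+# stringFunc1("lichtenau gericht ansbach", "gericht ") returns "lichtenau " (cut off at the match)
+# stringFunc1("gericht ansbach", "gericht ") returns "ansbach" (match at the beginning is removed)
+# stringFunc2(" bei ansbach", " bei ") returns " ansbach" (leading match replaced by a space)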
+
+
+def dataCleaner(dataForCleansing):
+    """
+    This function is used to clean up an urbanonym.
+    :param dataForCleansing: urbanonym (string)
+    :return: adjusted urbanonym (string)
+    """
+    # clean an urbanonym
+    behindTag = dataForCleansing  # data behind GEDCOM tag "PLAC" (the urbanonym)
+    behindTag = behindTag.lower()  # behindTag lower cases for better cleansing
+    # cleansing of behindTag
+    # attention: order of cleansing operations is relevant
+    # definition of banned words
+    letters = ["a",
+               "b",
+               "c",
+               "d",
+               "e",
+               "f",
+               "g",
+               "h",
+               "i",
+               "j",
+               "k",
+               "l",
+               "m",
+               "n",
+               "o",
+               "p",
+               "w",
+               "r",
+               "s",
+               "t",
+               "u",
+               "v",
+               "w",
+               "x",
+               "y",
+               "z"
+               ]
+    # exclude the possibility that an abbreviation of a US state appears at the end
+    if behindTag[-4:-2] == ", " and behindTag[-2:-1] in letters and behindTag[-1:] in letters:
+        behindTag = "unrealisticSequenceOfStringsAlpha"
+    # definition of words that must not be included in the urbanonym
+    # banning state abbreviations is critical because they are also the beginnings of other place names
+    for bannedWords in ["kanada",
+                        "canada",
+                        "america",
+                        "united states",
+                        " usa",
+                        "alabama",
+                        "alaska",
+                        "arizona",
+                        "arkansas",
+                        "california",
+                        "colorado",
+                        "connecticut",
+                        "delaware",
+                        "florida",
+                        "georgia",
+                        "hawaii",
+                        "idaho",
+                        "illinois",
+                        "indiana",
+                        "iowa",
+                        "kansas",
+                        "kentucky",
+                        "louisiana",
+                        "maine",
+                        "maryland",
+                        "massachusetts",
+                        "michigan",
+                        "minnesota",
+                        "mississippi",
+                        "missouri",
+                        "montana",
+                        "nebraska",
+                        "nevada",
+                        "new hapshire",
+                        "new jersey",
+                        "new york",
+                        "north carolina",
+                        "north dakota",
+                        "ohio",
+                        "oklahoma",
+                        "oregon",
+                        "pennsylvania",
+                        "rohde island",
+                        "south carolina",
+                        "south dakota",
+                        "tennessee",
+                        "texas",
+                        "utah",
+                        "vermont",
+                        "virginia",
+                        "washington",
+                        "west virginia",
+                        "wisconsin",
+                        "wyoming",
+                        "england",
+                        "united kingdom",
+                        "australia",
+                        "spain",
+                        "espagne",
+                        "glamorga",
+                        "russia",
+                        "luxembourg",
+                        "scotland",
+                        "irland",
+                        "norway",
+                        "griechenland",
+                        "turkey",
+                        "südafrika",
+                        "brasil",
+                        "france"]:
+        if bannedWords in behindTag:
+            behindTag = "unrealisticSequenceOfStringsAlpha"
+    # definition of words that must not be equal to the urbanonym
+    for bannedWords in ["germany",
+                        "poland",
+                        "france",
+                        "russland"]:  # ausschließlich das, nicht "enthält"
+        if bannedWords == behindTag:
+            behindTag = "unrealisticSequenceOfStringsBeta"
+
+    # if there is no space after a dot, it should be added
+    if "." in behindTag:
+        position = behindTag.find(".")
+        if behindTag[position + 1:position + 2] != " ":
+            behindTag = behindTag[:position + 1] + " " + behindTag[position + 1:]
+    # removal of defined strings
+    behindTag = behindTag.replace(">", "")  # remove ">"
+    behindTag = behindTag.replace("<", "")  # remove "<"
+    behindTag = behindTag.replace("_", "")  # remove "_"
+    behindTag = behindTag.replace("'", "")  # remove "'"
+    behindTag = behindTag.replace("rk.", "")  # remove "rk."
+    behindTag = behindTag.replace("ev.", "")  # remove "ev."
+    behindTag = behindTag.replace("waldfriedhof", "")  # remove "("waldfriedhof"
+    behindTag = behindTag.replace("friedhof", "")  # remove "friedhof"
+    behindTag = behindTag.replace("wahrscheinlich", "")  # remove "wahrscheinlich"
+    behindTag = behindTag.replace("aus ", "")  # remove "aus "
+    # remove digits
+    for digit in "0123456789":
+        behindTag = behindTag.replace(digit, "")
+    # repair mis-encoded special characters (map to umlauts and ß)
+    behindTag = behindTag.replace("\xa7", "ß")
+    behindTag = behindTag.replace("\x94", "ö")
+    behindTag = behindTag.replace("\x9a", "ö")
+    behindTag = behindTag.replace("\x8a", "ä")
+    behindTag = behindTag.replace("\x9f", "ü")
+    # removal of further special characters
+    behindTag = behindTag.replace("(?)", "")  # before removing "?", otherwise many problems with empty brackets
+    behindTag = behindTag.replace("?", "")  # often standing alone or behind places
+    behindTag = behindTag.replace(" -", "")  # only with spaces in front, not as hyphen
+
+    # definition of strings to be removed
+    stringFunc1List = ["standesamt ",
+                       "sta ",
+                       "ksp. ",
+                       "ksp ",
+                       "kirchspiel ",
+                       "kirche ",
+                       "pfarramt ",
+                       "ambt ",
+                       "oder ",
+                       "gemeinde ",
+                       "gmde. ",
+                       "gmde ",
+                       "pfarrei ",
+                       "gericht ",
+                       "ksp. "
+                       ]
+    for i in stringFunc1List:
+        behindTag = stringFunc1(behindTag, i)
+
+    # definition of strings to be deleted if they are at the beginning
+    stringFunc2List = [" bei ",
+                       " b. ",
+                       " in ",
+                       " im "
+                       ]
+    for i in stringFunc2List:
+        behindTag = stringFunc2(behindTag, i)
+
+    # writing out abbreviations
+    behindTag = behindTag.replace("berg. ", "bergisch ")  # Example: Bergisch Gladbach
+    behindTag = behindTag.replace("b. ", "bei ")  # Lichtenau b. Ansbach
+
+    # deletion of not needed content
+    if "jetzt" in behindTag:  # Example: Grone jetzt Göttingen
+        position = behindTag.find(" jetzt")
+        behindTag = behindTag[:position]
+    if "heute" in behindTag:  # Example:
+        position = behindTag.find(" heute")
+        behindTag = behindTag[:position]
+    if " um" in behindTag:  # Example: ... um 12 Uhr
+        position = behindTag.find(" um")
+        behindTag = behindTag[:position]
+    if " bei" in behindTag:  # Example: Lipke bei Landsberg
+        position = behindTag.find(" bei")
+        behindTag = behindTag[:position]
+    if " kr." in behindTag:  # Example: Bronn Kr. Mergentheim
+        position = behindTag.find(" kr.")
+        behindTag = behindTag[:position]
+    if " amt" in behindTag:
+        position = behindTag.find(" amt")
+        behindTag = behindTag[:position]
+    if "/" in behindTag:  # Example: Crossen/Oder
+        position = behindTag.find("/")
+        behindTag = behindTag[:position]
+    while behindTag[:1] == ",":  # delete preceding commas
+        behindTag = behindTag[1:]
+    if "," in behindTag:  # Example: Arendzhain, Kreis Luckau
+        position = behindTag.find(",")
+        behindTag = behindTag[:position]
+    if " in " in behindTag:  # Example: Taufe in Ogkeln
+        position = behindTag.find(" in ")
+        behindTag = behindTag[(position + len(" in ")):]
+
+    # eliminate double spaces
+    behindTag = behindTag.replace("  ", " ")
+    # eliminate spaces
+    behindTag = behindTag.strip(" ")
+
+    # overwrite return value
+    dataForCleansing = behindTag
+    return (dataForCleansing)
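+
+# worked example (sketch), using the inline example above: dataCleaner("Bronn Kr. Mergentheim")
+# lower-cases the input and truncates it at " kr.", returning "bronn"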
+
+
+def bannedObjects():
+    """
+    This function defines banned object types.
+    Banned object types are object types in the GOV that should not be used for identification.
+    Currently all ecclesiastical objects (up to and including 263) are banned.
+    Currently all legal objects (e.g. courts, from 263) are banned.
+    Currently administrative divisions outside Germany that make allocation difficult (from 257) are banned.
+    List of object types: http://gov.genealogy.net/type/list (retrieved on 8 December 2020)
+    Sometimes there is no English translation of the names of the object types.
+    :return: list of banned object types
+    """
+    return (["124",  # imperial abbey
+             "250",  # Apostolische Administratur
+             "6",  # diocese
+             "91",  # Bistumsregion
+             "9",  # deanery
+             "260",  # Delegaturbezirk
+             "11",  # diocese
+             "12",  # Dompfarrei
+             "13",  # filial church
+             "249",  # Erzbischöfliches Amt
+             "96",  # archbishopric
+             "219",  # Expositur
+             "245",  # chapel
+             "26",  # church
+             "210",  # Kirchenbund
+             "92",  # Kirchengemeinde
+             "27",  # Kirchenkreis
+             "28",  # Kirchenprovinz
+             "29",  # parish
+             "153",  # Kommissariat
+             "35",  # national church
+             "243",  # Propstei
+             "244",  # Nebenkirche
+             "245",  # chapel
+             "249",  # Erzbischöfliches Amt
+             "41",  # Pfarr-Rektorat
+             "42",  # parish
+             "155",  # region
+             "43",  # Pfarrkuratie
+             "44",  # Pfarrverband
+             "155",  # region
+             "206",  # selsoviet
+             "253",  # religious organization
+             "49",  # sprengel
+             "260",  # Delegaturbezirk
+             "263",  # Landratsbezirk
+             "151",  # Oberlandesgericht
+             "105",  # judicial
+             "3",  # Magistrates' Court
+             "223",  # Landgericht
+             "224",  # Pfleggericht
+             "228",  # Gerichtsamt
+             "19",  # Gerichtsbezirk
+             "70",  # bailiwick
+             "79",  # hundred
+             "114",  # Vest
+             "154",  # Honschaft
+             "202",  # Amtsgerichtsbezirk
+             "257",  # Landgemeinde PL
+             "264",  # Mairie
+             "135",  # canton
+             "134",  # arrondissement
+             "25"  # canton
+             ])
+
+
+def mainPlaceFinder(data, resultQualityChecker, filename, miniGov):
+    """
+    This function attempts to assign a GOV identifier to each location in a GEDCOM file.
+    :param data: content of one GEDCOM file
+    :param resultQualityChecker: content for one line of the file "quality.csv"
+    :param filename: name of the file/source
+    :param miniGov: list of merged entries of the Mini-GOV
+    :return: list of dictionaries, which contains the identification for each location
+    """
+    # copy the content so that the original quality result is not modified
+    gedcomMetaInfo = copy.copy(resultQualityChecker)
+
+    # definition of banned object types
+    # banned object types are object types in the GOV that should not be used for identification
+    # currently all ecclesiastical objects (up to and including 263), all legal objects (e.g. courts, from 263) and administrative divisions outside Germany that make allocation difficult (from 257)
+    # list of object types: http://gov.genealogy.net/type/list (retrieved on 8 December 2020)
+    # sometimes there is no English translation of the names of the object types
+    bannedObjectTypes = bannedObjects()
+
+    # "data" is compromised by the dataCleaner function and could no longer be used
+    # therefore a copy must be created that does not represent a pointer (that's why copy.copy is used)
+    initialGedcomData = copy.copy(data)
+    gedcomData = copy.copy(data)
+
+    # clean up every urbanonym in a GEDCOM file
+    # clean each row in gedcomData
+    for cleanCounter in range(len(gedcomData)):
+        resultParser = qualitychecker.gedcomRowParser(gedcomData, cleanCounter)  # separate the data of one row
+        tag = resultParser[2]  # GEDCOM tag
+        behindTag = resultParser[3]  # data behind GEDCOM tag
+        behindTag = behindTag.lower()  # behindTag lower cases for better cleansing
+        # for urbanonyms:
+        if tag == "PLAC":
+            dataCleaned = dataCleaner(behindTag)
+            # overwrite the original GEDCOM line with the cleaned text
+            gedcomData[cleanCounter] = resultParser[0] + " " + resultParser[2] + " " + dataCleaned
+
+    # creation of a list of locations and their sources
+    locList = []
+    for counter, i in enumerate(gedcomData):
+        if i[2:6] == "PLAC":
+            # adjusted urbanonym, original urbanonym, name of file
+            locList.append([i[7:], initialGedcomData[counter][7:], filename])
+
+    # delete duplicates in a source
+    locList = sorted(set(map(tuple, locList)), reverse=True)
+
+    # creation of a list containing the identifying data per urbanonym in a dictionary
+    resultList = []
+    for counter, i in enumerate(locList):
+        locNameClean = i[0]  # urbanonym after cleansing
+        locName = i[1]  # urbanonym without cleansing
+        fileName = i[2]
+        # find place
+        resultPlaceFinder = placeFinder(locNameClean,
+                                        miniGov,
+                                        gedcomMetaInfo,
+                                        bannedObjectTypes
+                                        )
+        # create dictionary
+        identifyingInfo = {
+            "id": resultPlaceFinder[0],
+            "longitude": resultPlaceFinder[1],
+            "latitude": resultPlaceFinder[2],
+            "selection information": resultPlaceFinder[3],
+            "adjusted name": locNameClean,
+            "original name": locName,
+            "filename": fileName
+        }
+        resultList.append(identifyingInfo)
+    return (resultList)
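+
+# a minimal sketch of one entry of the returned resultList (values are hypothetical):
+# {"id": "LUBECKJO53IU", "longitude": "10.68", "latitude": "53.87",
+#  "selection information": "Selected based on a single matching hit",
+#  "adjusted name": "lübeck", "original name": "Lübeck", "filename": "1.ged"}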
diff --git a/2022_005_goldberg/Skripte/provincefinder.py b/2022_005_goldberg/Skripte/provincefinder.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1b8eb8d1c6320f74fe361f5d2b013d3356d32dc
--- /dev/null
+++ b/2022_005_goldberg/Skripte/provincefinder.py
@@ -0,0 +1,404 @@
+import julian
+import placefinder
+import time
+
+
+def provincesURI(referenceYear):
+    """
+    This function defines the URIs of different regions at different times.
+    For years between 1872 and 1989 no mapping is defined, so the function returns None.
+    :param referenceYear: year to which an administrative assignment should be made
+    :return: dictionary of GOV object URIs and the textual description of the respective province
+    """
+    # for times before 1872
+    if referenceYear <= 1871:
+        return ({
+            "object_190122": "A 01 Provinz Holstein",
+            # Problem in the GOV: The places are not linked to the historical offices, so the province is never found
+            "adm_131053": "A 02 Provinz Lauenburg",
+            "object_1081716": "A 03 Provinz Brandenburg (ohne Berlin)",
+            # if Berlin is meant, it will be recorded before
+            "object_190330": "A 04 Provinz Hessen-Nassau",
+            "object_268785": "A 05 Provinz Hohenzollern",
+            "object_284443": "A 05 Provinz Hohenzollern",
+            # Hohenzollern-Sigmaringen goes 1850 to Hohenzollerschen Landen
+            "adm_368500": "A 06 Provinz Ostpreußen",
+            "adm_368480": "A 07 Provinz Pommern",
+            "object_211667": "A 08 Provinz Posen",
+            "object_279654": "A 09 Provinz Sachsen",
+            "adm_368470": "A 10 Provinz Schlesien",
+            "object_190325": "A 11 Provinz Westfalen",
+            "object_213750": "A 12 Provinz Westpreußen",
+            "object_1047283": "A 13 Rheinprovinz",  # Provinz Jülich-Kleve-Berg until 1822
+            "object_405464": "A 13 Rheinprovinz",  # Provinz Großherzogtum Niederrhein until 1822
+            "object_190337": "A 13 Rheinprovinz",
+            "BERLINJO62PM": "A 14 Provinz Berlin",
+            "object_257607": "B 01 Amt Bergedorf",
+            "adm_369040": "B 02 Hansestadt Bremen",
+            "adm_369020": "B 03 Stadt Hamburg",
+            "LUBECKJO53IU": "B 04 Stadt Lübeck",
+            "adm_136412": "B 05 Stadt Frankfurt am Main",
+            "object_217406": "B 06 Fürstentum Lippe-Detmold",
+            "object_217818": "B 07 Fürstentum Schaumburg-Lippe",
+            "object_218152": "B 08 Fürstentum Waldeck-Pyrmont",
+            "object_352387": "B 09 Großherzogtum Oldenburg",
+            "object_217952": "B 10 Großherzogtum Baden",
+            "object_218147": "B 11 Hessen",
+            "object_217750": "B 12 Großherzogtum Mecklenburg-Schwerin",
+            "object_217749": "B 13 Großherzogtum Mecklenburg-Strelitz (einschließlich des Fürstentums Ratzeburg)",
+            "object_190873": "B 14 Herzogtum Anhalt",
+            "object_217954": "B 15 Herzogtum Braunschweig",
+            "object_218153": "B 16 Herzogtum Nassau",
+            "object_190098": "B 17 Herzogtum Schleswig",
+            "object_190729": "B 18 Königreich Württemberg",
+            "object_217953": "B 19 Königreich Bayern",
+            "object_190327": "B 20 Königreich Hannover",
+            "object_218149": "B 21 Königreich Sachsen",
+            "object_275299": "B 22 Kurfürstentum Hessen",  # here equated with Kurhessen
+            "object_284442": "B 23 Landgrafschaft Hessen-Homburg",
+            "": "B 24 Thüringische Staaten",  # is divided into many sub-states as follows
+            "object_218143": "B 24 Thüringische Staaten",  # Sachsen-Weimar-Eisenach
+            "object_284441": "B 24 Thüringische Staaten",  # Reuß Jüngere Linie
+            "object_218134": "B 24 Thüringische Staaten",  # Reuß Ältere Linie
+            "object_218137": "B 24 Thüringische Staaten",  # Sachsen-Altenburg
+            "object_218138": "B 24 Thüringische Staaten",  # Sachsen-Coburg-Gotha
+            "object_265487": "B 24 Thüringische Staaten",  # Sachsen Gotha
+            "object_218142": "B 24 Thüringische Staaten",  # Sachsen-Meiningen
+            "object_218150": "B 24 Thüringische Staaten",  # Schwarzburg-Rudolstadt
+            "object_218151": "B 24 Thüringische Staaten",  # Schwarzburg-Sondershausen
+            "object_218141": "B 24 Thüringische Staaten"  # Sachsen-Hildburghausen, has no subordinate objects
+        })
+    # for times after 1989
+    elif referenceYear >= 1990:
+        return ({
+            "BERLINJO62PM": "Land Berlin",
+            "object_218149": "Freistaat Sachsen",
+            "adm_369080": "Land Baden-Württemberg",
+            "adm_369090": "Freistaat Bayern",
+            "adm_369120": "Land Brandenburg",
+            "adm_369040": "Freie Hansestadt Bremen",
+            "object_1259992": "Freie und Hansestadt Hamburg",
+            "adm_369060": "Land Hessen",
+            "adm_369130": "Land Mecklenburg-Vorpommern",
+            "adm_369030": "Land Niedersachsen",
+            "adm_369050": "Land Nordrhein-Westfalen",
+            "adm_369070": "Land Rheinland-Pfalz",
+            "adm_369100": "Saarland",
+            "adm_369150": "Land Sachsen-Anhalt",
+            "adm_369010": "Land Schleswig-Holstein",
+            "adm_369160": "Freistaat Thüringen"
+        })
+
+
+def provinceFinder(govid, referenceYear, client):
+    """
+    This function determines the historical-administrative affiliation to an object at a given time.
+    :param govid: GOV identifier (string)
+    :param referenceYear: year to which an administrative assignment should be made
+    :param client: connection to the GOV-Webservice
+    :return: province or "None"
+    """
+    # if this variable is 1, the program will be stopped for one second in case of internet connection failures
+    # this prevents an abort of the program due to internet problems, but leads to a longer runtime
+    withSleeping = 0
+
+    # definition of prohibited object types
+    bannedObjectTypes = placefinder.bannedObjects()
+    # assignment of objects to be found and historical-administrative units
+    provinces = provincesURI(referenceYear)
+
+    # if GOV identifier empty, then return None
+    if govid == "":
+        return ("None")
+
+    govidBefore = 0  # initialization
+    # following loop jumps one level up in the membership tree per iteration
+    # number of 10 is currently chosen arbitrarily, in the hope that no tree has more levels
+    for ab in range(0, 10):
+        # here, possible superordinate objects are included, which are appropriate in time
+        govidsList = []  # list A, priority
+        # List B (non priority) is required if no object fits so well that it is included in List A.
+        nonPrioGovidsList = []  # list B, non priority
+        # termination condition: if the same object is examined twice in a row, then abort
+        # this check improves the runtime, so that the same object is not searched up to 10 times
+        if govid == govidBefore:
+            print("Error: Object can no longer take a meaningful step (GOV-ID, GOV-ID before):", govid, govidBefore)
+            break
+        # since "govid" changes, the previous one must be cached
+        govidBefore = govid
+        # check if the object already matches a province
+        try:  # if yes, then there is no KeyError
+            province = provinces[govid]
+            return (province)
+        except KeyError:
+            # information about the object is obtained from the web service (a dictionary composed of dictionaries)
+            if withSleeping == 1:
+                for run in range(1000):
+                    try:
+                        govidInfo = callWebservice(govid, client)
+                        break  # success, leave the retry loop
+                    except Exception:  # if the connection is just gone the program should not crash
+                        time.sleep(1)
+                        print("Status: Sleeping for 1 s.")
+                        if run == 999:
+                            print("Status: Connection error")
+            else:
+                govidInfo = callWebservice(govid, client)
+            # from this the entry "part-of" is required
+            govidInfoSuperior = govidInfo['part-of']
+            # if "part-of" is empty, then the info is in "located-in" if necessary
+            if len(govidInfoSuperior) == 0:
+                govidInfoSuperior = govidInfo["located-in"]
+
+            # every superior object is now searched
+            # the date can be in three places: 1. in timespan, 2. in begin-year/end-year, 3. in year
+            for superior in range(len(govidInfoSuperior)):
+                # if timespan available
+                # if timespan is not None, use the years from it
+                if govidInfoSuperior[superior]["timespan"] is not None:
+                    yearBegin = begincalculator(govidInfoSuperior[superior])
+                    yearEnd = endcalculator(govidInfoSuperior[superior])
+                    # check if the timespan matches the searched time
+                    # if yes a list is extended
+                    if yearBegin <= referenceYear and yearEnd >= referenceYear:
+                        govid = govidInfoSuperior[superior]["ref"]
+                        if callWebservice(govid, client)["type"][0]["value"] not in bannedObjectTypes:
+                            govidsList.append(govid)
+                    else:
+                        superiorType = callWebservice(govidInfoSuperior[superior]["ref"], client)["type"][0]["value"]
+                        if superiorType not in bannedObjectTypes:
+                            nonPrioGovidsList.append(govidInfoSuperior[superior]["ref"])
+                # if timespan not available
+                else:
+                    try:
+                        # begin is determined
+                        if govidInfoSuperior[superior]["begin-year"] is None:
+                            yearBegin = 1  # sets begin to year 1
+                        else:
+                            yearBegin = govidInfoSuperior[superior]["begin-year"]
+                        # end is determined
+                        if govidInfoSuperior[superior]["end-year"] is None:
+                            yearEnd = 9999  # set end to year 9999
+                        else:
+                            yearEnd = govidInfoSuperior[superior]["end-year"]
+                        # if an object has an assumed time (start 1, end 9999), then always list B (problem otherwise e.g. with KIRORFJO40NS, adm_137138)
+                        if yearBegin == 1 or yearEnd == 9999:
+                            superiorType = callWebservice(govidInfoSuperior[superior]["ref"], client)["type"][0]["value"]
+                            if superiorType not in bannedObjectTypes:
+                                nonPrioGovidsList.append(govidInfoSuperior[superior]["ref"])
+                        # comparison with reference time
+                        elif yearBegin <= referenceYear and yearEnd >= referenceYear:
+                            govid = govidInfoSuperior[superior]["ref"]
+                            if callWebservice(govid, client)["type"][0]["value"] not in bannedObjectTypes:
+                                govidsList.append(govid)
+                        else:
+                            superiorType = callWebservice(govidInfoSuperior[superior]["ref"], client)["type"][0]["value"]
+                            if superiorType not in bannedObjectTypes:
+                                nonPrioGovidsList.append(govidInfoSuperior[superior]["ref"])
+                    except TypeError:
+                        print(
+                            "Error: A problem has occurred in the calculation of time spans. Presumably there are letters as numbers:",
+                            govidInfoSuperior[superior])
+            # if one of the objects in list A or B is one of the target objects, then return its province
+            for i in govidsList:  # list A
+                try:
+                    province = provinces[i]
+                    return (province)  # Search was successful!
+                except KeyError:
+                    continue
+            for i in nonPrioGovidsList:  # list B
+                try:
+                    province = provinces[i]
+                    return (province)  # Search was successful!
+                except KeyError:
+                    continue
+            # if list A is empty, then list B should be used
+            if len(govidsList) == 0:
+                # if list B is also empty, then an attempt is made to fill it further
+                if len(nonPrioGovidsList) == 0:  # Example: Case LIEHA2JO62RV, which has no part-of
+                    for a in range(len(govidInfoSuperior)):
+                        # the type of the following object is of interest (not the previous one)
+                        superiorType = callWebservice(govidInfoSuperior[a]["ref"], client)["type"][0]["value"]
+                        if superiorType not in bannedObjectTypes:
+                            nonPrioGovidsList.append(govidInfoSuperior[a]["ref"])
+                govidsList = nonPrioGovidsList
+
+            # rate objects in list A or B
+
+            # delete duplicate values
+            # duplicate affiliations to the same object at different times may exist (e.g. adm_144024), but this is recognized below
+            govidsList = list(set(govidsList))
+
+            # if list contains only one object, then this is the appropriate one to perform the next iteration
+            if len(govidsList) == 1:
+                govid = govidsList[0]
+            # if list contains no object, then cancel
+            elif len(govidsList) == 0:
+                # mandatory abort, because no object could be determined to perform the next iteration
+                break
+            else:  # case where list contains more than one value
+                closerInTime = []  # initialization
+                # each object in the list is checked to see how close the time limits are to the reference time
+                for elementGovidsList in govidsList:
+                    # a simple list comprehension to find the index is inappropriate, since the searched value can occur several times
+                    # therefore a list is created
+                    indexList = []  # results are stored in this list
+                    for counter, resultPartOf in enumerate(govidInfoSuperior):
+                        if resultPartOf["ref"] == elementGovidsList:
+                            indexList.append(counter)
+                    if len(indexList) == 0:
+                        index = None
+                        print("Error: The object name does not occur.")
+                    for index in indexList:
+                        # if timespan is given, then it is more detailed
+                        if govidInfoSuperior[index]["timespan"] is not None:
+                            yearBegin = begincalculator(govidInfoSuperior[index])
+                            yearEnd = endcalculator(govidInfoSuperior[index])
+                        # if only one year, but no begin or end
+                        elif govidInfoSuperior[index]["begin-year"] is None and \
+                                govidInfoSuperior[index]["end-year"] is None and \
+                                govidInfoSuperior[index]["year"] is not None:
+                            yearBegin = govidInfoSuperior[index]["year"]
+                            yearEnd = govidInfoSuperior[index]["year"]
+                        else:  # if no timespan
+                            yearBegin = govidInfoSuperior[index]["begin-year"]
+                            if yearBegin is None:  # if there is no value
+                                yearBegin = 1
+                            yearEnd = govidInfoSuperior[index]["end-year"]
+                            if yearEnd is None:
+                                yearEnd = 9999
+                        diffBegin = abs(yearBegin - referenceYear)
+                        diffEnd = abs(yearEnd - referenceYear)
+                        clusterDict = {
+                            "object": elementGovidsList,
+                            "diffbegin": diffBegin,
+                            "diffend": diffEnd,
+                            "begin-year": yearBegin,
+                            "end-year": yearEnd
+                        }
+                        closerInTime.append(clusterDict)  # list of dictionaries
+                diff = 9999  # initialization
+                # In the following it is examined which of the chronologically obvious results is the closest in time.
+                # it is irrelevant whether the difference lies before or after the reference time
+                for counter, i in enumerate(closerInTime):
+                    # Equal comparisons are critical in cases where time limits overlap (e.g. object_289942 --> until 1920, since 1920)
+                    if int(i["diffbegin"]) < diff:
+                        diff = int(i["diffbegin"])
+                        closestInTime = counter
+                    elif int(i["diffbegin"]) == diff:
+                        # search the absolute value of the start (not the difference)
+                        yearBegin = i["begin-year"]
+                        # if the reference year is smaller than or equal to the begin year
+                        if referenceYear <= yearBegin:
+                            # if this one is a "begin" and the other an "end", then take the one with the end
+                            # if the previous one is not an end (its diffend is neither +diff nor -diff), then switch to the new one
+                            if (closerInTime[closestInTime]["diffend"] + diff) != 0 and (
+                                    closerInTime[closestInTime]["diffend"] - diff) != 0:
+                                closestInTime = counter
+                        # larger
+                        elif referenceYear > yearBegin:
+                            # if the previous one is not a beginning (its diffbegin is neither +diff nor -diff), then switch to the new one
+                            if (closerInTime[closestInTime]["diffbegin"] + diff) != 0 and (
+                                    closerInTime[closestInTime]["diffbegin"] - diff) != 0:
+                                closestInTime = counter
+                    if int(i["diffend"]) < diff:
+                        diff = int(i["diffend"])
+                        closestInTime = counter
+                    elif int(i["diffend"]) == diff:
+                        # look at the absolute value of the end year (not the difference)
+                        yearEnd = i["end-year"]
+                        # if the reference year is smaller than or equal to the end year
+                        if referenceYear <= yearEnd:
+                            # take this one if the previous (closestInTime) is a begin or not an end
+                            if (closerInTime[closestInTime]["diffend"] + diff) != 0 and (
+                                    closerInTime[closestInTime]["diffend"] - diff) != 0:
+                                closestInTime = counter
+                        # larger
+                        elif referenceYear > yearEnd:
+                            # take this if the previous one is not a beginning
+                            if (closerInTime[closestInTime]["diffbegin"] + diff) != 0 and (
+                                    closerInTime[closestInTime]["diffbegin"] - diff) != 0:
+                                closestInTime = counter
+                # the object whose time limits lie closest to the reference time is selected
+                # if the time difference is equal, the last (later) object is selected (<=)
+                # reason: in several regularly occurring special cases (e.g. some places in Poznan), the right one tends to come later
+                govid = closerInTime[closestInTime]["object"]
+    return ("None")
+
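+# Usage sketch (the GOV id shown is hypothetical): assign a province for 1800.
+#     province = provinceFinder("object_190325", 1800, client)
+#     print(province)  # province name, or "None" if no assignment was possible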
+
+def callWebservice(govid, client):
+    """
+    This function calls the GOV webservice.
+    An internet connection is required.
+    :param govid: GOV identifier
+    :param client: connection to the GOV-Webservice
+    :return: information of the GOV about the corresponding GOV identifier
+    """
+    gotObject = client.service.getObject(govid)
+    return (gotObject)
+
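+# Usage sketch: `client` is assumed to be a SOAP client such as one created with
+# the zeep library; the WSDL URL below is an assumption and should be checked
+# against the current GOV webservice documentation.
+#     from zeep import Client
+#     client = Client("https://gov.genealogy.net/services/ComplexService?wsdl")
+#     info = callWebservice("object_190325", client)  # hypothetical GOV id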
+
+def begincalculator(data):
+    """
+    This function converts the timespan data of an object (available as Julian date) into a year number, which describes the beginning of the affiliation.
+    :param data: time information about administrative affiliations
+    :return: year as integer
+    """
+    timespan = data["timespan"]
+    if timespan["begin"] is not None:
+        begin = timespan["begin"]
+        jd = begin["jd"] - 2400000  # julian date
+        yearBegin = julian.from_jd(jd, fmt='mjd')
+        yearBegin = int(yearBegin.year)  # must be int to compare it
+    else:
+        yearBegin = 1  # then set the start to a very early year
+    return (yearBegin)
+
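+# Worked example for the conversion above (illustrative value): a GOV begin date
+# of jd = 2415021 yields 2415021 - 2400000 = 15021, and julian.from_jd(15021,
+# fmt='mjd') falls in the year 1900, so begincalculator returns 1900. Strictly,
+# MJD = JD - 2400000.5; the half-day offset can only matter for dates within
+# twelve hours of a year boundary.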
+
+def endcalculator(data):
+    """
+    This function converts the timespan data of an object (available as Julian date) into a year number, which describes the end of the affiliation.
+    :param data: time information about administrative affiliations
+    :return: year as integer
+    """
+    timespan = data["timespan"]
+    if timespan["end"] is not None:
+        end = timespan["end"]
+        jd = end["jd"] - 2400000  # julian date
+        yearEnd = julian.from_jd(jd, fmt='mjd')
+        yearEnd = int(yearEnd.year)  # must be int to compare it
+    else:
+        yearEnd = 9999  # then set the end to a very late year
+    return (yearEnd)
+
+
+def mainProvinceFinder(resultPlaceFinder, filename, client, time):
+    """
+    This function assigns the identified urban names to a historical province.
+    :param resultPlaceFinder: list of dictionaries, which contains the identification for each location
+    :param filename: name of the file/source
+    :param client: connection to the GOV-Webservice
+    :param time: year to which an administrative assignment should be made
+    :return: list of dictionaries containing urbanonym, source, GOV-identifier and assigned provinces
+    """
+
+    # perform clustering for each urbanonym of the identification
+    provincesDictList = []
+    for counter, i in enumerate(resultPlaceFinder):
+        # only edit entries that match the source
+        if i["filename"] != filename:
+            continue  # only happens with data loaded from CSV
+        govid = i["id"]  # GOV identifier
+        # if identification has failed, then clustering cannot be successful
+        if govid != "NONE":
+            # trigger clustering if identification is successful
+            resultProvinceFinder = provinceFinder(govid, time, client)
+        else:
+            resultProvinceFinder = "NONE"
+        provincesDict = {
+            "original name": i["original name"],
+            "filename": i["filename"],
+            "id": govid,
+            "province": resultProvinceFinder
+        }
+        provincesDictList.append(provincesDict)
+    return (provincesDictList)
diff --git a/2022_005_goldberg/Skripte/qualitychecker.py b/2022_005_goldberg/Skripte/qualitychecker.py
new file mode 100644
index 0000000000000000000000000000000000000000..69a7725df774bcdc66174374c8474e28d97fc6f2
--- /dev/null
+++ b/2022_005_goldberg/Skripte/qualitychecker.py
@@ -0,0 +1,384 @@
+from haversine import haversine
+
+
+def prePlaceFinder(data, minigov, fileName):
+    """
+    This function creates a list of all unique urban names within a source.
+    The function works only with GEDCOM files as source (in this specification).
+    If other files are available as sources, an adjustment is necessary here.
+    :param data: content of one GEDCOM file
+    :param minigov: list of merged entries of the Mini-GOV
+    :param fileName: name of the GEDCOM file from which the places originate
+    :return: list of uniquely identifiable locations (based on the name without context)
+    """
+    placelist = []
+    for gedcomRow in data:
+        if "PLAC" in gedcomRow:
+            # overwrite the row by deleting the tag information
+            # +5 because "PLAC" has four characters followed by a space
+            gedcomRow = gedcomRow[gedcomRow.find("PLAC") + 5:]
+            # searching in the Mini-GOV
+            minigovSearchResult = minigovSearch(minigov, gedcomRow)
+            rowInMiniGOV = minigovSearchResult[0]
+            # if there was a unique match, rowInMiniGOV is not equal to -1
+            if rowInMiniGOV != -1:
+                govId = minigov[rowInMiniGOV]["GOV-Kennung"]  # GOV id of the detected place
+                longitude = minigov[rowInMiniGOV]["geographische Länge"]  # longitude
+                latitude = minigov[rowInMiniGOV]["geographische Breite"]  # latitude
+            else:  # with no clear hit
+                govId = "NONE"
+                longitude = "NONE"
+                latitude = "NONE"
+            # extend a list of places each with a dictionary
+            placeDict = {
+                "place": gedcomRow,
+                "govid": govId,
+                "longitude": longitude,
+                "latitude": latitude,
+                "filename": fileName,
+                "numberHits": minigovSearchResult[1]  # can be "1", ">1" or "<1"
+            }
+            placelist.append(placeDict)
+    return (placelist)
+
+
+def gedcomRowParser(data, counter):
+    """
+    The function parses GEDCOM rows into their individual components.
+    :param data: content of a GEDCOM file
+    :param counter: number of GEDCOM row
+    :return: list of four elements (first character, content behind first char, tag, content behind tag)
+    """
+    # save first character
+    firstChar = data[counter][:1]
+
+    # content after the first character excluding the first space
+    behindFirstChar = data[counter][2:]
+
+    # parsing of the tag
+    # if there is no further text behind the tag, then there is no space
+    try:
+        tag = behindFirstChar[:(behindFirstChar.index(" "))]
+    except ValueError:
+        tag = behindFirstChar
+
+    # content behind the tag
+    try:
+        behindTag = behindFirstChar[((behindFirstChar.index(" ")) + 1):]
+    except ValueError:
+        behindTag = ""
+    return ([firstChar, behindFirstChar, tag, behindTag])
+
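+# Example (derived from the parser above): for the GEDCOM line
+# "2 PLAC Berlin, Deutschland" gedcomRowParser returns
+# ["2", "PLAC Berlin, Deutschland", "PLAC", "Berlin, Deutschland"].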
+
+def minigovSearch(minigov, value):
+    """
+    This function searches the Mini-GOV for location names.
+    :param minigov: list of merged entries of the Mini-GOV
+    :param value: name of the urbanonym
+    :return: List with two values (1. contains the line number in the Mini-GOV if the search result is unique, otherwise -1; 2. contains how many hits were found)
+    """
+    # name of the column of the Mini-GOV to be searched
+    key = "aktueller Name"
+
+    # initial base cleanup of the place name
+    # cut off everything from the first comma
+    try:
+        valueCleaned = value[:value.index(",")]
+    except ValueError:
+        valueCleaned = value
+
+    # initialization of a list in which the line numbers of matching Mini-GOV entries are collected
+    hitsNumberList = []
+    # initialization of a list in which the urbanonyms of matching Mini-GOV entries are collected
+    hitsUrbanonymList = []
+
+    # Binary search algorithm for searching the Mini-GOV
+    # initial position is the center of the Mini-GOV
+    position = int(len(minigov) / 2)
+    # position value of the previous iteration
+    # initially len(minigov) rather than 0, so that the first step width abs(previousPosition - position) / 2 covers half of the list
+    previousPosition = len(minigov)
+    # search until the distance to the previous position is less than 10
+    while (previousPosition - position) not in range(-10, 10):
+        previousPositionCache = position  # temporary storage, because position changes and the previous value previousPosition is still needed
+        if valueCleaned > minigov[position][key]:  # alphabetical comparison
+            position = position + int(
+                abs(previousPosition - position) / 2)  # amount of the difference between previousPosition and pos / 2
+        elif valueCleaned < minigov[position][key]:  # alphabetical comparison
+            position = position - int(
+                abs(previousPosition - position) / 2)  # amount of the difference between previousPosition and pos / 2
+        elif valueCleaned == minigov[position][key]:  # alphabetical comparison, equality
+            break  # an exact match cannot be improved upon, so stop searching
+        previousPosition = previousPositionCache
+    # if a position was found, the 30 values above and below this position should be compared with valueCleaned
+    # the assumption is that no place name occurs 60 times or more, therefore 30 is chosen
+    # negative indices would silently wrap around to the end of the list, therefore clamp explicitly
+    start = max(position - 30, 0)  # start at the beginning if the position is quite far ahead
+    end = min(position + 30, len(minigov))  # end the search at the end of the list if the position is quite far back
+    # compare from start to finish if the value from the Mini-GOV matches the name of the source
+    for i in range(start, end):
+        if minigov[i][key] == valueCleaned:
+            hitsNumberList.append(i)
+            hitsUrbanonymList.append(valueCleaned)
+
+    # if only one unique value is found, pass the line number from the Mini-GOV and the information that there was only one hit
+    if len(hitsNumberList) == 1:
+        return ([hitsNumberList[0], "1"])
+    # with more than one hit it should be made clear with -1 that no clear hit was achieved
+    elif len(hitsNumberList) > 1:
+        return ([-1, ">1"])
+    # with less than one hit, -1 should be used to indicate that no clear hit was achieved
+    elif len(hitsNumberList) < 1:
+        return ([-1, "0"])
+
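+
+def minigovSearchBisect(minigov, valueCleaned, key="aktueller Name"):
+    """
+    Alternative sketch (not called by the program): since the Mini-GOV list is
+    sorted by the column searched here, the stdlib bisect module can locate all
+    exact matches directly; shown only to illustrate the binary search above.
+    """
+    import bisect
+    names = [row[key] for row in minigov]  # sorted list of place names
+    left = bisect.bisect_left(names, valueCleaned)  # first index >= value
+    right = bisect.bisect_right(names, valueCleaned)  # first index > value
+    hits = list(range(left, right))  # row numbers of all exact matches
+    if len(hits) == 1:
+        return [hits[0], "1"]
+    return [-1, ">1" if hits else "0"]
+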
+
+def qualityChecker(content, placelist, previousQualityData, filename):
+    """
+    This function is used to get qualitative parameters for each GEDCOM file.
+    This includes, for example, the information about the number of persons.
+    In this program, the determined data is also called metadata of the source.
+    :param content: contents of the GEDCOM file
+    :param placelist: list of uniquely identifiable locations in the source
+    :param previousQualityData: source metadata from previous processing runs
+    :param filename: name of GEDCOM file
+    :return: list of quality parameters or a string as termination condition if the file has already been processed
+    """
+    # prepare qualitychecker()
+    minOneFileIsMissing = 0  # binary variable for detecting missing files in the result CSV; if it is 1, at least one GEDCOM file is not in the CSV
+
+    # check if the file has been edited before
+    # if nothing is found, qualityIndex is None, otherwise the number of the row is contained
+    qualityIndex = next((index for (index, d) in enumerate(previousQualityData) if d["filename"] == filename), None)
+    if qualityIndex is not None:
+        # define a string as termination condition
+        # if the file has already been edited once, it should not be edited a second time
+        quality = "StartingExitStrategy"
+        return (quality)
+    else:  # file has not been edited yet
+        # initialising of variables
+        numberOfCoordinates = 0  # counter how many unique coordinates in file
+        numberOfPLAC = 0  # counts the number of PLAC tags
+        latitude = 0
+        longitude = 0
+        numberClusters = 1  # number of created clusters
+        haversineDict = {  # for clustering, (0, 0) is always the initial value
+            "coordinates": (0, 0),  # acceptable, because (0, 0) lies in the sea and all real places are far away
+            "cluster": 0,
+            "filename": 0
+        }
+        clusterList = [haversineDict]
+
+        # call each line of the GEDCOM file in sequence
+        for i in range(len(content)):
+            # parse a GEDCOM line
+            resultParser = gedcomRowParser(content, i)
+            tag = resultParser[2]
+            behindTag = resultParser[3]
+
+            # cleanup of the content; removal of the content from the first comma
+            try:
+                behindTag = behindTag[:behindTag.index(",")]
+            except ValueError:
+                pass  # no comma present, keep the full string
+
+            # if they are urbanonyms, calculate average coordinates
+            if tag[:4] == "PLAC":
+                numberOfPLAC = numberOfPLAC + 1
+                # formation of clusters of unique values
+                # compare with any unique location in placelist
+                # identifying entries in placelist by the place name is unproblematic, because placelist by definition contains only unique values (e.g. Berlin cannot occur twice)
+                for placePlacelist in placelist:
+                    # comparing
+                    if behindTag == placePlacelist["place"] and placePlacelist["longitude"] != "NONE" and \
+                            placePlacelist["longitude"] != "":
+                        # add coordinates and a number variable
+                        longitude = longitude + float(placePlacelist["longitude"])  # are still strings
+                        latitude = latitude + float(placePlacelist["latitude"])
+                        numberOfCoordinates = numberOfCoordinates + 1
+
+                        # clustering of placePlacelist
+                        clusterListCache = clusterList  # note: an alias, not a copy; safe here because clusterList is only extended after the loop below has finished
+                        # list of possible clusters for a location
+                        clusterAffiliation = []  # must be reset, because otherwise clusters are always found
+                        # checking whether an existing cluster is less than 50 km away from a location
+                        for singleCluster in clusterListCache:
+                            # skip clusters already noted, so that not every other element of the same cluster has to be examined
+                            if singleCluster["cluster"] not in clusterAffiliation:
+                                coordPlace = (float(placePlacelist["latitude"]), float(placePlacelist["longitude"]))
+                                coordMeanCluster = singleCluster["coordinates"]
+                                # calculation of the distance in kilometers between location and possible other locations in clusters
+                                distance = haversine(coordPlace, coordMeanCluster)
+                                if distance <= 50:  # in kilometers, at zero it is a place that already exists
+                                    # if the location is closer than 50 km to an existing cluster, the cluster is assigned to a list of possible clusters
+                                    # a location can belong to several clusters and thus connect them
+                                    clusterAffiliation.append(singleCluster["cluster"])
+
+                        # with only one membership it will be added to the cluster
+                        if len(clusterAffiliation) == 1:
+                            haversineDict = {
+                                "coordinates": (float(placePlacelist["latitude"]), float(placePlacelist["longitude"])),
+                                "cluster": clusterAffiliation[0],
+                                "filename": placePlacelist["filename"]
+                            }
+                            clusterList.append(haversineDict)  # add to existing cluster
+                        # more than one cluster is close under 50 km (clusters are merged)
+                        elif len(clusterAffiliation) > 1:
+                            # select the cluster name to be kept (the lowest one wins)
+                            minCluster = min(clusterAffiliation)
+                            for singleClusterAffiliation in clusterAffiliation:
+                                # all other cluster entries that are not minCluster must be renamed to minCluster
+                                if singleClusterAffiliation != minCluster:
+                                    for singleClusterList in clusterList:
+                                        if singleClusterList["cluster"] == singleClusterAffiliation:
+                                            singleClusterList["cluster"] = minCluster  # change the value in the dict
+                            # then create the new entry for the location
+                            haversineDict = {
+                                "coordinates": (float(placePlacelist["latitude"]), float(placePlacelist["longitude"])),
+                                "cluster": minCluster,
+                                "filename": placePlacelist["filename"]
+                            }
+                            clusterList.append(haversineDict)
+                        # no affiliation to a cluster
+                        # own cluster is created
+                        elif len(clusterAffiliation) == 0:
+                            haversineDict = {
+                                "coordinates": (float(placePlacelist["latitude"]), float(placePlacelist["longitude"])),
+                                "cluster": numberClusters,
+                                "filename": placePlacelist["filename"]
+                            }
+                            clusterList.append(haversineDict)
+                            numberClusters = numberClusters + 1  # count the total number of clusters
+
+                        # if there was a hit once, there can be no second hit, because placelist has only unique values; coordinates that occur twice are included twice in the calculation, because the whole part is executed multiple times
+                        break
+
+        # calculate average coordinates of whole source
+        if numberOfCoordinates != 0:  # avoid division by zero
+            longitude = longitude / numberOfCoordinates
+            latitude = latitude / numberOfCoordinates
+        else:
+            longitude = "NONE"
+            latitude = "NONE"
+
+        # per GEDCOM file
+        # calculate number of different clusters
+        existingCluster = []  # list of assigned clusters
+        clusterMeanList = []  # list of averages of all clusters in a file for further processing
+        numberOfFinalCluster = 0
+        # save only the cluster numbers from the clusterList
+        for singleClusterList in clusterList:
+            existingCluster.append(singleClusterList["cluster"])
+
+        # save only the coordinates from the clusterlist
+        # one point per location in the file
+        clusterLatLon = []
+        for singleClusterList in clusterList:
+            clusterLatLon.append(
+                [singleClusterList["coordinates"][0], singleClusterList["coordinates"][1], 500])  # regulate intensity
+
+        # calculation of the geographical center of the clusters in a file
+        # definition of a minimum number of locations in a cluster
+        minimumClusterSize = 6  # clusters with 5 or fewer places are not considered
+        # per cluster, basic idea: compare every location with every cluster
+        # create numbers, which can be used as names for the clusters
+        for possibleClusterNumber in range(len(clusterList)):
+            # initialization for the calculation of averages
+            lat = 0
+            long = 0
+            numberLatLong = 0
+            # add coordinates of matching clusters together
+            for singleClusterList in clusterList:
+                if singleClusterList["cluster"] == possibleClusterNumber:
+                    lat = lat + singleClusterList["coordinates"][0]
+                    long = long + singleClusterList["coordinates"][1]
+                    numberLatLong = numberLatLong + 1
+            # via numberLatLong small clusters can be excluded; it must be at least 1,
+            # because only clusters that really exist should be considered
+            if numberLatLong >= minimumClusterSize:  # must be checked here, because otherwise the divisor would be distorted and clusters would be created where there is no cluster entry (e.g. 23)
+                lat = lat / numberLatLong  # numberLatLong is at least minimumClusterSize here, so no division by zero
+                long = long / numberLatLong
+                # the list is used for further calculations to determine/cluster locations
+                clusterMeanList.append([lat, long])
+
+        # counting the remaining clusters (those with at least the minimum size)
+        existingCluster = stringDuplicateCounter(existingCluster)
+        for singleExistingCluster in existingCluster:
+            if singleExistingCluster[1] >= minimumClusterSize:
+                numberOfFinalCluster = numberOfFinalCluster + 1
+
+        # counting hits
+        noHit = 0
+        moreThanOneHit = 0
+        oneHit = 0
+        for singlePlacelist in placelist:
+            if singlePlacelist["numberHits"] == "1":
+                oneHit = oneHit + 1  # in contrast to numberOfCoordinates also those without coordinates
+            elif singlePlacelist["numberHits"] == "0":
+                noHit = noHit + 1
+            elif singlePlacelist["numberHits"] == ">1":
+                moreThanOneHit = moreThanOneHit + 1
+
+        # generation of a dictionary for all meta data
+        quality = {
+            "filename": filename,
+            "longitude mean of of definitely coordinates": longitude,
+            "latitude mean of of definitely coordinates": latitude,
+            "number of places": numberOfPLAC,
+            "number of noHit": noHit,
+            "number of moreThanOneHit": moreThanOneHit,
+            "number of definitely coordinates": oneHit,
+            "number of existing clusters": (len(existingCluster) - 1),  # minus 1, because cluster is initial 0
+            "number of relevant clusters": numberOfFinalCluster,
+            "cluster midpoints": clusterMeanList
+        }
+    return (quality)
+
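+# Mini example of the distance test used above (coordinates are illustrative):
+#     from haversine import haversine
+#     berlin = (52.5200, 13.4050)   # (latitude, longitude)
+#     potsdam = (52.3906, 13.0645)
+#     haversine(berlin, potsdam)  # roughly 27 km -> same cluster under the 50 km rule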
+
+def stringDuplicateCounter(inputList):
+    """
+    This function is used to count equal values (duplicates) in lists.
+    :param inputList: list to be examined (one column)
+    :return: list of lists containing the name and number of each element in the list
+    """
+    newList = []
+    attribute = []
+    # examine each element of "inputList"
+    for i in inputList:  # i is an element of the list
+        # for each new element this step is performed
+        if i not in attribute:
+            # count the number of these elements in the list
+            doublingCounter = 0
+            for y in inputList:
+                if i == y:
+                    doublingCounter = doublingCounter + 1
+            newList.append([[i], doublingCounter])
+            attribute.append(i)
+    # alphabetical sorting
+    newList.sort()
+    return (newList)
+
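+# Equivalent result using the standard library (shown for comparison only):
+#     from collections import Counter
+#     sorted([[name], count] for name, count in Counter(inputList).items())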
+
+def mainMetadataInspector(line, filename, miniGovList, previousQualityData):
+    """
+    This function first initializes the creation of a list of unique location information in a source.
+    Afterwards it is used to achieve a further analysis of metadata or qualitative features.
+    :param line: contents of the GEDCOM file
+    :param filename: name of the file/source
+    :param miniGovList: list of merged entries of the Mini-GOV
+    :param previousQualityData: source metadata from previous processing runs
+    :return: content for one line of the file "quality.csv"
+    """
+    # creation of a list with unique places
+    placelist = prePlaceFinder(line, miniGovList, filename)  # placelist is list of unique locations with coordinates
+    # metadata analysis, calculate some parameters
+    quality = qualityChecker(line, placelist, previousQualityData, filename)
+    return (quality)
diff --git a/2022_005_goldberg/Skripte/readme.txt b/2022_005_goldberg/Skripte/readme.txt
index ce2cf0970c947c9fae29dd0a330a83fff367d9d8..ca1e245a476fb2c33fe4f4d91b713b74041ae63a 100644
--- a/2022_005_goldberg/Skripte/readme.txt
+++ b/2022_005_goldberg/Skripte/readme.txt
@@ -1,45 +1,27 @@
 readme
 
-The following instructions are intended to enable use of the Python script and an interpretation of the results.
+The following instructions are intended to enable use of the Python scripts and an interpretation of the results. The program code is compatible with Python version 3.6.
 
 Libraries:
-So that the program can be executed, additional libraries may need to be installed locally. The libraries used are listed in the first lines of the script.
-Input files with occupation entries:
-The program is designed to process two different types of input files: (1) CSV files and (2) GEDCOM files. Depending on which type is present, the parameter "typeOfData" in the program code must be set to "csv" or "ged".
-If the occupation entries are available in a CSV file, it must be structured so that it contains one column whose first row bears the heading "occupation". The following rows each contain the occupation entries to be lemmatized. A folder "data", in which the file is stored, must also exist at the location of the script. The file is named "occupations.csv".
-If the occupation entries are available in GEDCOM files, the GEDCOM files must be named with consecutive numbers ("1.ged", "2.ged", etc.). Numbers must not be used twice. These files are also placed in the subfolder "data".
+So that the program can be executed, additional libraries may need to be installed locally. The libraries used are listed in the first lines of the respective files.
 
-Variant list:
-Like the input file with the new occupation entries, the CSV file with the existing variants is added to the subfolder "data".
-The file must be named "variants.csv". It contains three columns, bearing the headings "variant" and "OhdAB_01". The first column contains the textual designation and the second the assigned OhdAB code. If a different classification system is used, any coding can also be used in the third column; the heading should nevertheless not be changed.
+Input files:
+The program processes place names from GEDCOM files. The GEDCOM files must be named with consecutive numbers ("1.ged", "2.ged", etc.). Numbers must not be used twice. These files are placed in a subfolder "data". If sources other than GEDCOM files are to be used, the program must be modified. It is not advisable to use a single combined list of place names, since the program relies on relating the place names of one context to each other. Context here means that these place names are mentioned together in one source (i.e. in one context), which implies geographical proximity. A separate file should therefore be created and processed for all place names of one context.
+The files of the Mini-GOV are also located in the subfolder "data". By default, the Mini-GOVs of Germany, Poland, Austria, Switzerland, the Czech Republic, Denmark, France and the Netherlands are included.
+In addition, the program tries to open the files "quality.csv", "placefinder.csv" and "provincesdict.csv", which are located in the same folder as the file "main.py". These are also the output files of the program (see below). If they do not exist, they are created anew. If they do exist, the existing data is used so that GEDCOM files that have already been processed are not run again. This helps above all in cases where the program aborts midway because of an intermittent internet connection (see next section).
+Intermittent internet connection: The program accesses the GOV webservice to query information on individual place names. A permanent internet connection is required for this. Since there can be dropouts, especially over Wi-Fi, that may cause the program to abort, a delay on internet failures is built into the program. It can be switched on and off manually. The variable withSleeping is located in the file "provincefinder.py" at the beginning of the function "provinceFinder()". If it is set to 1 and a connection to the webservice cannot be established, the program pauses for one second. However, this also means that the program takes longer to run overall. By default, this feature is not activated.
 
 Parallelization:
-The processing of GEDCOM files runs in parallel to increase speed. The number of processor cores to be used can be specified by changing the parameter of the function "Pool()". If it is left empty, all available processor cores are used. In the script, the number of cores is set to one core by default.
+The processing of GEDCOM files runs in parallel to increase speed. The number of processor cores to be used can be specified by changing the parameter of "Pool()" in the main script. If it is left empty, all available processor cores are used. In the script, the default is to use all available cores.
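+For example (illustrative): "Pool(4)" would restrict the processing to four cores, while "Pool()" uses all available cores.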
+Province assignment: The place names are assigned to different provinces. By default, provinces before 1871 and after 1990 are assigned in the file "provincefinder.py". For the time in between, no provincial assignment is possible. However, this can be adapted and extended as desired. The reference time can be changed in the main script in the function "parallel()" via the variable referencetime. By default, it is set to the year 1800.
 
-Halving the variants:
-To achieve the halving of the variants, the variable halving must be set to "yes". This option serves mainly for testing purposes.
-Further iterations with the new variants:
-Drawing on the newly lemmatized occupation variants, it is optionally possible to process further occupation variants for which there is no match in the original variant list. Three iterations are set up by default. If these are not to take place, the code sections after the comments "second processing" and "third processing" must be commented out.
+Clusters:
+The clustering of places plays an important role in their identification. The minimum distance as well as the minimum number of places in a cluster can be varied. The minimum distance between two clusters can be changed in the file "qualitychecker.py" in the function "qualityChecker()" via the condition "if distance <= 50:". The same function contains the variable minimumClusterSize, which controls the minimum size of a cluster. By default, it is set to 6 places.
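+For example (illustrative): changing the condition to "if distance <= 30:" and setting minimumClusterSize = 4 would form geographically tighter clusters while also accepting smaller ones.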
 
 Output files:
-As a result, the file "occuResult.csv" is created and continuously extended during the program run. The individual columns are separated by tab stops. It contains the designation of the checked variant ("variant"), for GEDCOM files the name of the file in which it occurred ("source"), and the frequency of this variant in this source ("number"). The further columns contain various pieces of information for the individual occupation entries (occupation1-occupation5). A maximum of five individual occupations are identified (e.g. from the entry "Häusler und Fleischer und Gastwirt und Richter und Schenker"). The information is structured as follows:
-
-Scheme: designation - explanation
-occupation - cleaned occupational designation
-KldB 2010 - OhdAB code, if an assignment can be made
-best fit lemma - designation of the best-fitting variant
-row of best fit lemma - row of the best-fitting variant in the variant list
-titel - title information filtered out of the original occupation entry
-role - role information filtered out of the original occupation entry
-year - year information filtered out of the original occupation entry
-url - URL filtered out of the original occupation entry
-location - place information filtered out of the original occupation entry
-further info - other information filtered out of the original occupation entry (was present in parentheses)
-selection info - information on whether the occupation entry could be assigned to an existing variant ("found direct", "not found", "found after levenshtein", "no occupational designation")
-absolute distance - absolute Levenshtein distance to the best-fitting variant
-relative distance - relative Levenshtein distance to the best-fitting variant
-
-In addition, a file "newVariants.csv" is created in the subfolder "data", containing the designation of the new variant ("variant"), the designation of the already existing variant ("lemma") and the OhdAB code ("OhdAB_01").
-
-
-Jan Michael Goldberg, 22 February 2022
\ No newline at end of file
+The program produces three files in which the individual columns are separated by tab stops. The file "quality.csv" provides information about the nature and quality of the information in the GEDCOM files. For each GEDCOM file there is one row with the file name, the number of place names in the file, then this number of place names split into places without a match (noHit), places with more than one match (moreThanOneHit) and places with exactly one match (definitely coordinates), the mean of the longitudes and latitudes, the number of existing clusters, the number of relevant clusters, and a list of the coordinates of the midpoints of relevant clusters.
+The file "provincesdict.csv" contains four columns: the unchanged place name from a source, the file name, the GOV id and the assigned province. Its purpose is that place names occurring twice in a file do not have to be processed twice.
+The file "placefinder.csv" contains, for each place name, the ID (GOV id), the coordinates, information on how the assignment to the GOV id was made, the cleaned version of the place name, the original place name, and the name of the file in which the name occurs.
+
+
+Jan Michael Goldberg, 30 June 2022
\ No newline at end of file