Neue Datei hochladen

e3e8b6b3 · Marcus Baumgarten · 7a54214a · e3e8b6b3
Commit e3e8b6b3 authored 2 years ago by Marcus Baumgarten
--- a/placefinder.py
+++ b/placefinder.py
+from Levenshtein import distance
+from haversine import haversine
+import copy
+import qualitychecker
+def placeFinder(locNameClean, miniGOV, gedcomMetaInfos, bannedObjectTypes):
+    """
+    Resolve a cleaned urbanonym to a Mini-GOV object and report its key data.
+    The actual identification is delegated to find(); this function only translates
+    the row index delivered by find() into identifier and coordinates.
+    :param locNameClean: name of place after cleansing
+    :param miniGOV: list of all objects in the Mini-GOV
+    :param gedcomMetaInfos: content for one line of the file "quality.csv"
+    :param bannedObjectTypes: list of banned object types
+    :return: list of GOV identifier, longitude, latitude, and information about identification process
+    """
+    # the cluster midpoints of the current source are needed by find() to choose between several hits
+    clusterMidpoints = gedcomMetaInfos["cluster midpoints"]
+    found = find(miniGOV, locNameClean, clusterMidpoints, bannedObjectTypes)
+    rowIndex = found[0]  # row of the selected object in the Mini-GOV, -1 if nothing was selected
+    selectInfo = found[1]  # explanation of how (or why not) an object was selected
+    # without a selected row every value except the explanation is reported as "NONE"
+    if rowIndex == -1:
+        return ["NONE", "NONE", "NONE", selectInfo]
+    return [miniGOV[rowIndex]["GOV-Kennung"],  # ID of GOV object
+            miniGOV[rowIndex]["geographische Länge"],  # longitude
+            miniGOV[rowIndex]["geographische Breite"],  # latitude
+            selectInfo]
+def find(miniGOV, locNameClean, clusterMeanList, bannedObjectTypes):
+    """
+    This function identifies an adjusted urbanonym.
+    The cleaned name and, if it cannot be identified, its components (parts around a hyphen,
+    parts outside/inside brackets, part before the first space) are looked up one after another.
+    For each candidate a rough position in the Mini-GOV is searched, then up to 30 rows before
+    and after that position are compared with the candidate via the Levenshtein distance.
+    The position search presumes that "miniGOV" is sorted by the lower-case "aktueller Name".
+    :param miniGOV: list of all objects in the Mini-GOV
+    :param locNameClean: name of place/urbanonym after cleansing
+    :param clusterMeanList: list of means of coordinates for the clusters in a source
+    :param bannedObjectTypes: list of banned object types
+    :return: index of the line in "miniGOV" of the identified location (-1 if none) and information about the type of identification
+    """
+    # headline of the column with the relevant information (actual name) of objects
+    keyMiniGOV = "aktueller Name"
+    # markers set by the cleansing for banned content are never looked up
+    if locNameClean == "unrealisticSequenceOfStringsAlpha":
+        return [-1, "Not selected on the basis of prohibited content."]
+    if locNameClean == "unrealisticSequenceOfStringsBeta":
+        return [-1, "Not selected based on prohibited specification."]
+    # define rank order of some types (priority)
+    # if there are several matches, it is more likely to be a "Stadt" (more urban) than a "Ort" (more rural)
+    orderRankObjectTypes = ["Kreisfreie Stadt",
+                            "Stadt",
+                            "Dorf",
+                            "Pfarrdorf",
+                            "Ort",
+                            "Ortsteil",
+                            "Ortschaft",
+                            "Wohnplatz",
+                            "Weiler"]
+    # cleaned location data can contain several urbanonyms (e.g. places in brackets, hyphen as separation)
+    # "valueList" contains all candidates to be examined and starts with the entire cleaned-up designation
+    valueList = [locNameClean]
+    # hyphen: add what comes before and what comes after the first hyphen
+    if "-" in locNameClean:
+        positionMinus = locNameClean.find("-")
+        valueList.append(locNameClean[:positionMinus])
+        valueList.append(locNameClean[positionMinus + 1:])
+    # brackets: add what is in front of the brackets and what is inside them
+    if "(" in locNameClean and ")" in locNameClean:
+        positionBreakedOpen = locNameClean.find("(")
+        positionBreakedClose = locNameClean.find(")")
+        # trailing spaces are stripped instead of blindly cutting one character, so that a missing
+        # space before the bracket or a bracket at the very front does not remove a real character
+        valueList.append(locNameClean[:positionBreakedOpen].rstrip(" "))
+        valueList.append(locNameClean[positionBreakedOpen + 1:positionBreakedClose])
+    # as a last resort check the part up to the first space
+    if " " in locNameClean:
+        positionSpace = locNameClean.find(" ")
+        valueList.append(locNameClean[:positionSpace])
+    # testing of the different candidates
+    # as soon as a decision is possible the function returns, so not all passes are needed
+    for counter, newLocValueClean in enumerate(valueList):
+        # halving search for a rough position of the candidate in the Mini-GOV
+        position = int(len(miniGOV) / 2)  # start in the middle of the Mini-GOV
+        furtherPosition = len(miniGOV)  # previous position; initially the end of the list
+        # execute loop until the new position is less than about 10 lines away from the old one
+        while (furtherPosition - position) not in range(-10, 10):
+            positionCache = position
+            # designation from the Mini-GOV must be converted to lower case
+            nameAtPosition = miniGOV[position][keyMiniGOV].lower()
+            if newLocValueClean == nameAtPosition:
+                break  # runtime improvement, it cannot be more precise
+            # move by half of the distance between the previous and the current position
+            step = int(abs(furtherPosition - position) / 2)
+            if newLocValueClean > nameAtPosition:
+                position = position + step
+            else:
+                position = position - step
+            furtherPosition = positionCache
+        # compare up to 30 lines before and after the found position (e.g. many places with the same name)
+        # the window is clamped to the list: a negative start index would not raise an IndexError in
+        # Python but silently wrap around to the end of the Mini-GOV and produce negative row numbers
+        startPosition = max(0, position - 30)
+        endPosition = min(len(miniGOV), position + 30)
+        equalList = []  # line numbers of values that are equal
+        similarList = []  # line numbers of values that are similar
+        # similarity analysis only makes sense (and the relative distance is only defined) for non-empty candidates
+        if newLocValueClean != "":
+            for i in range(startPosition, endPosition):
+                levenshteinDistance = distance(miniGOV[i][keyMiniGOV].lower(), newLocValueClean)
+                if levenshteinDistance == 0:
+                    # both strings are the same
+                    equalList.append(i)
+                elif levenshteinDistance / len(newLocValueClean) <= 0.17:
+                    # relative Levenshtein distance of at most 0.17 counts as similar
+                    similarList.append(i)
+            # similar hits are only considered if there is no equal hit
+            if len(equalList) != 0:
+                similarList = []
+        # "equalList" has a priority over "similarList"
+        # exactly one equal hit
+        if len(equalList) == 1:
+            return [equalList[0], "Selected based on a single matching hit"]
+        # more than one equal hit: start a selection
+        if len(equalList) > 1:
+            return areaSearch(equalList,
+                              "",
+                              miniGOV,
+                              clusterMeanList,
+                              bannedObjectTypes,
+                              orderRankObjectTypes)
+        # no equal hit but exactly one similar hit
+        if len(similarList) == 1:
+            # even if there is only one hit, it must not have a banned object type
+            if miniGOV[similarList[0]]["Objekttyp als Zahl"] in bannedObjectTypes:
+                return [-1, "Not selected because nothing was found in the Mini-GOV (with similarity analysis)"]
+            return [similarList[0], "Selected based on a single matching hit in the similarity analysis"]
+        # no equal hit but more than one similar hit: start a selection
+        if len(similarList) > 1:
+            return areaSearch(similarList,
+                              "(with similarity analysis)",
+                              miniGOV,
+                              clusterMeanList,
+                              bannedObjectTypes,
+                              orderRankObjectTypes)
+        # no equal or similar hit: give up only after the last candidate, otherwise try the next one
+        if (counter + 1) == len(valueList):
+            return [-1, "Not selected because nothing was found in the Mini-GOV (with similarity analysis)"]
+    # if nothing is found until here, then return -1
+    return [-1, "Nothing selected because nothing was found in the Mini-GOV"]
+def areaSearch(similarList, supplementSelectInfo, miniGOV, clusterMeanList, bannedObjectTypes, orderRankObjectTypes):
+    """
+    This function selects one of several possible locations.
+    The basis for this is the distance to the cluster midpoints of the source. If no distance can be
+    calculated, candidates with banned object types are dropped and the rank order of the object
+    types decides.
+    :param similarList: list of line numbers in the Mini-GOV that match the urbanonym
+    :param supplementSelectInfo: text that is appended to "selectInfo"
+    :param miniGOV: list of all objects in the Mini-GOV
+    :param clusterMeanList: list of means of coordinates for the clusters in a source
+    :param bannedObjectTypes: list of banned object types
+    :param orderRankObjectTypes: list that defines rank order of some object types
+    :return: list of selected position (-1 if none) and an information about the selection/identification process
+    """
+    # reading coordinates from the Mini-GOV: [line number, longitude, latitude]
+    coordList = []
+    for i in similarList:  # i is position in Mini-GOV
+        longitude = miniGOV[i]["geographische Länge"]
+        latitude = miniGOV[i]["geographische Breite"]
+        coordList.append([i, longitude, latitude])
+    # calculate the distance of every candidate with coordinates to every cluster midpoint
+    geoDistanceList = []
+    for i in coordList:
+        # ignore entries without valid coordinates; without cluster midpoints nothing can be calculated
+        if i[1] != "" and i[1] != "NONE" and i[2] != "" and i[2] != "NONE" and len(clusterMeanList) != 0:
+            for j in clusterMeanList:  # clusterMeanList consists of "cluster midpoints"
+                # candidate is passed as (latitude, longitude)
+                # NOTE(review): the midpoint is passed as (j[0], j[1]); presumably also (latitude, longitude) - confirm
+                geoDistance = haversine((float(i[2]), float(i[1])), (float(j[0]), float(j[1])))
+                geoDistanceList.append([i, geoDistance])
+    # determination of the smallest distance (the first one wins if several are equal)
+    minimalDistance = 9999999.999  # some high initial value
+    for i in geoDistanceList:
+        newDistance = i[1]
+        if newDistance < minimalDistance:
+            minimalDistance = newDistance
+            positionMiniGOV = i[0][0]  # line number of the entry in the Mini-GOV that has the smallest distance
+    # only one value with coordinates remains
+    # not 1, but 1*cluster, because one distance is created for each cluster; inequality condition mandatory
+    if len(geoDistanceList) == 1 * len(clusterMeanList) and len(geoDistanceList) != 0:
+        selectInfo = "Selected because it was the only one with coordinates " + supplementSelectInfo
+    # several values remain, but the closest value is selected
+    elif len(geoDistanceList) > 1 * len(clusterMeanList):
+        selectInfo = "Selected on the basis of geographical proximity " + supplementSelectInfo
+    # no distance was determined
+    elif len(geoDistanceList) == 0:
+        # no geographical distance available, but some candidates may be excluded via their types
+        noGeoDistButAllowedTypeList = []
+        for i in coordList:
+            if miniGOV[i[0]]["Objekttyp als Zahl"] not in bannedObjectTypes:
+                noGeoDistButAllowedTypeList.append(i)
+        # one object remains, choose this
+        if len(noGeoDistButAllowedTypeList) == 1:
+            selectInfo = "Selected based on the only valid type " + supplementSelectInfo
+            positionMiniGOV = noGeoDistButAllowedTypeList[0][0]
+        # no element is left over
+        elif len(noGeoDistButAllowedTypeList) == 0:
+            selectInfo = "None selected, because none has a valid type " + supplementSelectInfo
+            positionMiniGOV = -1
+        # several are left over: selection via ranking order of the object types
+        else:
+            for objectTyp in orderRankObjectTypes:
+                # collect ALL remaining candidates of this type before deciding; deciding inside the
+                # collecting loop would always pick the first candidate and never detect ambiguity
+                objectTypeRankList = []
+                for elementCoordList in noGeoDistButAllowedTypeList:
+                    if miniGOV[elementCoordList[0]]["Objekttyp als Text"] == objectTyp:
+                        objectTypeRankList.append(elementCoordList[0])
+                # exactly one object of the highest-ranked type present, then select it
+                if len(objectTypeRankList) == 1:
+                    selectInfo = "Selected on the basis of a suitable type " + supplementSelectInfo
+                    return [objectTypeRankList[0], selectInfo]  # e. g. a city was found and preferred over a village
+                # multiple hits of the same type, none can be selected
+                if len(objectTypeRankList) > 1:
+                    selectInfo = "Not selected based on too many matching types " + supplementSelectInfo
+                    return [-1, selectInfo]
+                # if no hit, the loop is repeated with the next object type
+            # this part is only executed if none of the ranked types occurs among the candidates
+            selectInfo = "Not selected, because no heuristic gives a result " + supplementSelectInfo
+            positionMiniGOV = -1
+    return [positionMiniGOV, selectInfo]
+def stringFunc1(behindTag, string):
+    """
+    Purge a forbidden string from "behindTag".
+    If the first occurrence is at the very beginning, every occurrence of the string is deleted.
+    If the first occurrence is further back, everything from that occurrence on is cut off.
+    :param behindTag: urbanonym
+    :param string: forbidden string
+    :return: urbanonym purged from the forbidden string
+    """
+    firstHit = behindTag.find(string)
+    # forbidden string does not occur at all
+    if firstHit == -1:
+        return behindTag
+    # forbidden string is at the beginning: delete it wherever it occurs
+    if firstHit == 0:
+        return behindTag.replace(string, "")
+    # forbidden string is not at the beginning: keep only the part in front of it
+    return behindTag[:firstHit]
+def stringFunc2(behindTag, string):
+    """
+    Replace a forbidden string in "behindTag" by a single space, but only if "behindTag" starts with it.
+    In that case every occurrence of the string is replaced, not only the leading one.
+    :param behindTag: urbanonym
+    :param string: forbidden string
+    :return: urbanonym purged from the forbidden string
+    """
+    # nothing to do unless the urbanonym begins with the forbidden string
+    if not behindTag.startswith(string):
+        return behindTag
+    return behindTag.replace(string, " ")
+def dataCleaner(dataForCleansing):
+    """
+    This function is used to clean up an urbanonym.
+    The text is converted to lower case, checked for banned content, freed from special characters,
+    digits and administrative additions and finally reduced to the part that names the place.
+    Attention: the order of the cleansing operations is relevant.
+    :param dataForCleansing: urbanonym (string), the data behind the GEDCOM tag "PLAC"
+    :return: adjusted urbanonym (string); "unrealisticSequenceOfStringsAlpha" if banned content is
+             included, "unrealisticSequenceOfStringsBeta" if the urbanonym equals a banned specification
+    """
+    behindTag = dataForCleansing.lower()  # lower case for better cleansing
+    # exclude the possibility that an abbreviation of a US state appears at the end (", xy")
+    letters = list("abcdefghijklmnopqrstuvwxyz")  # complete alphabet, including "q"
+    if behindTag[-4:-2] == ", " and behindTag[-2:-1] in letters and behindTag[-1:] in letters:
+        return "unrealisticSequenceOfStringsAlpha"
+    # definition of words that must not be included in the urbanonym
+    # banning abbreviations of states is critical because they are beginnings of other places
+    bannedContainedWords = ["kanada",
+                            "canada",
+                            "america",
+                            "united states",
+                            " usa",
+                            "alabama",
+                            "alaska",
+                            "arizona",
+                            "arkansas",
+                            "california",
+                            "colorado",
+                            "connecticut",
+                            "delaware",
+                            "florida",
+                            "georgia",
+                            "hawaii",
+                            "idaho",
+                            "illinois",
+                            "indiana",
+                            "iowa",
+                            "kansas",
+                            "kentucky",
+                            "louisiana",
+                            "maine",
+                            "maryland",
+                            "massachusetts",
+                            "michigan",
+                            "minnesota",
+                            "mississippi",
+                            "missouri",
+                            "montana",
+                            "nebraska",
+                            "nevada",
+                            "new hampshire",  # spelling corrected, the misspelled entry never matched
+                            "new jersey",
+                            "new york",
+                            "north carolina",
+                            "north dakota",
+                            "ohio",
+                            "oklahoma",
+                            "oregon",
+                            "pennsylvania",
+                            "rhode island",  # spelling corrected, the misspelled entry never matched
+                            "south carolina",
+                            "south dakota",
+                            "tennessee",
+                            "texas",
+                            "utah",
+                            "vermont",
+                            "virginia",
+                            "washington",
+                            "west virginia",
+                            "wisconsin",
+                            "wyoming",
+                            "england",
+                            "united kingdom",
+                            "australia",
+                            "spain",
+                            "espagne",
+                            "glamorga",
+                            "russia",
+                            "luxembourg",
+                            "scotland",
+                            "irland",
+                            "norway",
+                            "griechenland",
+                            "turkey",
+                            "südafrika",
+                            "brasil",
+                            "france"]
+    for bannedWord in bannedContainedWords:
+        if bannedWord in behindTag:
+            return "unrealisticSequenceOfStringsAlpha"
+    # definition of words that must not be equal to the urbanonym (exclusively equal, not "contains")
+    # "france" can never be reached here because it is already banned as included word above
+    if behindTag in ["germany", "poland", "france", "russland"]:
+        return "unrealisticSequenceOfStringsBeta"
+    # if there is no space behind the first dot, it is added
+    # the character AFTER the dot is inspected; a dot at the very end needs no space
+    if "." in behindTag:
+        position = behindTag.find(".")
+        if behindTag[position + 1:position + 2] not in ("", " "):
+            behindTag = behindTag[:position + 1] + " " + behindTag[position + 1:]
+    # removal of defined strings
+    # NOTE(review): these are plain substring removals, so "aus " also hits word endings such as "haus "
+    for unwanted in [">", "<", "_", "'", "rk.", "ev.", "waldfriedhof", "friedhof", "wahrscheinlich", "aus "]:
+        behindTag = behindTag.replace(unwanted, "")
+    # remove numbers
+    for digit in "0123456789":
+        behindTag = behindTag.replace(digit, "")
+    # replace wrongly encoded characters by the intended German letters
+    for wrongChar, rightChar in [("\xa7", "ß"), ("\x94", "ö"), ("\x9a", "ö"), ("\x8a", "ä"), ("\x9f", "ü")]:
+        behindTag = behindTag.replace(wrongChar, rightChar)
+    # removal of further special characters
+    behindTag = behindTag.replace("(?)", "")  # before removing "?", otherwise many problems with empty brackets
+    behindTag = behindTag.replace("?", "")  # often standing alone or behind places
+    behindTag = behindTag.replace(" -", "")  # only with spaces in front, not as hyphen
+    # definition of strings to be removed (see stringFunc1 for the exact rule)
+    stringFunc1List = ["standesamt ",
+                       "sta ",
+                       "ksp. ",
+                       "ksp ",
+                       "kirchspiel ",
+                       "kirche ",
+                       "pfarramt ",
+                       "ambt ",
+                       "oder ",
+                       "gemeinde ",
+                       "gmde. ",
+                       "gmde ",
+                       "pfarrei ",
+                       "gericht ",
+                       "ksp. "
+                       ]
+    for i in stringFunc1List:
+        behindTag = stringFunc1(behindTag, i)
+    # definition of strings to be deleted if they are at the beginning
+    stringFunc2List = [" bei ",
+                       " b. ",
+                       " in ",
+                       " im "
+                       ]
+    for i in stringFunc2List:
+        behindTag = stringFunc2(behindTag, i)
+    # writing out abbreviations
+    behindTag = behindTag.replace("berg. ", "bergisch ")  # Example: Bergisch Gladbach
+    behindTag = behindTag.replace("b. ", "bei ")  # Example: Lichtenau b. Ansbach
+    # deletion of not needed content: everything from the marker on is cut off
+    # the position is only used if the marker was really found, because find() returns -1 otherwise
+    # and slicing with -1 would silently remove the last character
+    for marker in [" jetzt",  # Example: Grone jetzt Göttingen
+                   " heute",
+                   " um",  # Example: ... um 12 Uhr
+                   " bei",  # Example: Lipke bei Landsberg
+                   " kr.",  # Example: Bronn Kr. Mergentheim
+                   " amt",
+                   "/"  # Example: Crossen/Oder
+                   ]:
+        position = behindTag.find(marker)
+        if position != -1:
+            behindTag = behindTag[:position]
+    # delete preceding commas
+    behindTag = behindTag.lstrip(",")
+    if "," in behindTag:  # Example: Arendzhain, Kreis Luckau
+        behindTag = behindTag[:behindTag.find(",")]
+    if " in " in behindTag:  # Example: Taufe in Ogkeln (keep what follows " in ")
+        behindTag = behindTag[(behindTag.find(" in ") + len(" in ")):]
+    # eliminate double spaces
+    behindTag = behindTag.replace("  ", " ")
+    # eliminate spaces at the beginning and at the end
+    behindTag = behindTag.strip(" ")
+    return behindTag
+def bannedObjects():
+    """
+    Provide the object types of the GOV that should not be used for identification.
+    The types are given as numbers in string form; some numbers occur more than once.
+    The list is assembled from three groups (ecclesiastical objects, legal objects and
+    administrative divisions outside Germany that make allocation difficult).
+    List of object types: http://gov.genealogy.net/type/list (retrieved on 8 December 2020)
+    Sometimes there is no English translation of the names of the object types.
+    :return: list of banned object types
+    """
+    # NOTE(review): the assignment of single entries to the three groups is presumed from the
+    # original description of the list; the order of the entries is unchanged
+    ecclesiasticalTypes = ["124",  # imperial abbey
+                           "250",  # Apostolische Administratur
+                           "6",  # diocese
+                           "91",  # Bistumsregion
+                           "9",  # deanery
+                           "260",  # Delegaturbezirk
+                           "11",  # diocese
+                           "12",  # Dompfarrei
+                           "13",  # filial church
+                           "249",  # Erzbischöfliches Amt
+                           "96",  # archbishopric
+                           "219",  # Expositur
+                           "245",  # chapel
+                           "26",  # church
+                           "210",  # Kirchenbund
+                           "92",  # Kirchengemeinde
+                           "27",  # Kirchenkreis
+                           "28",  # Kirchenprovinz
+                           "29",  # parish
+                           "153",  # Kommissariat
+                           "35",  # national church
+                           "243",  # Propstei
+                           "244",  # Nebenkirche
+                           "245",  # chapel
+                           "249",  # Erzbischöfliches Amt
+                           "41",  # Pfarr-Rektorat
+                           "42",  # parish
+                           "155",  # region
+                           "43",  # Pfarrkuratie
+                           "44",  # Pfarrverband
+                           "155",  # region
+                           "206",  # selsoviet
+                           "253",  # religious organization
+                           "49",  # sprengel
+                           "260"  # Delegaturbezirk
+                           ]
+    legalTypes = ["263",  # Landratsbezirk
+                  "151",  # Oberlandesgericht
+                  "105",  # judicial
+                  "3",  # Magistrates' Court
+                  "223",  # Landgericht
+                  "224",  # Pfleggericht
+                  "228",  # Gerichtsamt
+                  "19",  # Gerichtsbezirk
+                  "70",  # bailiwick
+                  "79",  # hundred
+                  "114",  # Vest
+                  "154",  # Honschaft
+                  "202"  # Amtsgerichtsbezirk
+                  ]
+    foreignAdministrativeTypes = ["257",  # Landgemeinde PL
+                                  "264",  # Mairie
+                                  "135",  # canton
+                                  "134",  # arrondissement
+                                  "25"  # canton
+                                  ]
+    return ecclesiasticalTypes + legalTypes + foreignAdministrativeTypes
+def mainPlaceFinder(data, resultQualityChecker, filename, miniGov):
+    """
+    This function attempts to assign a GOV identifier to each location in a GEDCOM file.
+    Every "PLAC" line is cleaned, the distinct pairs of cleaned and original urbanonym are
+    collected and each of them is handed over to placeFinder().
+    :param data: content of one GEDCOM file (list of lines)
+    :param resultQualityChecker: content for one line of the file "quality.csv"
+    :param filename: name of the file/source
+    :param miniGov: list of merged entries of the Mini-GOV
+    :return: list of dictionaries, which contains the identification for each location
+    """
+    # meta information of the source (used for its cluster midpoints); it is only read, not changed
+    gedcomMetaInfo = resultQualityChecker
+    # banned object types are object types in the GOV that should not be used for identification
+    bannedObjectTypes = bannedObjects()
+    # the cleaned lines are written into a copy of the list, so that "data" itself stays untouched
+    # and the original wording of every line remains available
+    initialGedcomData = copy.copy(data)
+    gedcomData = copy.copy(data)
+    # clean up every urbanonym in the GEDCOM file
+    for cleanCounter in range(len(gedcomData)):
+        resultParser = qualitychecker.gedcomRowParser(gedcomData, cleanCounter)  # separate data of one row
+        tag = resultParser[2]  # GEDCOM tag
+        # for urbanonyms:
+        if tag == "PLAC":
+            # dataCleaner() converts to lower case itself
+            dataCleaned = dataCleaner(resultParser[3])  # resultParser[3] is the data behind the GEDCOM tag
+            # overwrite the GEDCOM line with the cleaned text
+            gedcomData[cleanCounter] = resultParser[0] + " " + resultParser[2] + " " + dataCleaned
+    # creation of a list of locations and their sources
+    # NOTE(review): the fixed offsets presume a one-character level, i.e. lines like "2 PLAC <text>"
+    locList = []
+    for counter, i in enumerate(gedcomData):
+        if i[2:6] == "PLAC":
+            # adjusted urbanonym, original urbanonym, name of file
+            locList.append([i[7:], initialGedcomData[counter][7:], filename])
+    # delete duplicates in a source
+    locList = sorted(set(map(tuple, locList)), reverse=True)
+    # creation of a list containing the identifying data per urbanonym in a dictionary
+    resultList = []
+    for i in locList:
+        locNameClean = i[0]  # urbanonym with cleansing
+        locName = i[1]  # urbanonym without cleansing
+        fileName = i[2]
+        # find place; the result is [GOV identifier, longitude, latitude, selection information]
+        resultPlaceFinder = placeFinder(locNameClean,
+                                        miniGov,
+                                        gedcomMetaInfo,
+                                        bannedObjectTypes
+                                        )
+        # create dictionary
+        # placeFinder() delivers the longitude at index 1 and the latitude at index 2;
+        # the keys are assigned accordingly (they were interchanged before)
+        identifyingInfo = {
+            "id": resultPlaceFinder[0],
+            "latitude": resultPlaceFinder[2],
+            "longitude": resultPlaceFinder[1],
+            "selection information": resultPlaceFinder[3],
+            "adjusted name": locNameClean,
+            "original name": locName,
+            "filename": fileName
+        }
+        resultList.append(identifyingInfo)
+    return resultList