From 43edfb8117f247ef4f2ba433a5ca55800073adb2 Mon Sep 17 00:00:00 2001
From: Marcus Baumgarten <baumgarten@hab.de>
Date: Wed, 28 Sep 2022 10:55:24 +0000
Subject: [PATCH] Neue Datei hochladen

---
 provincefinder.py | 404 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 404 insertions(+)
 create mode 100644 provincefinder.py

diff --git a/provincefinder.py b/provincefinder.py
new file mode 100644
index 0000000..f1b8eb8
--- /dev/null
+++ b/provincefinder.py
@@ -0,0 +1,404 @@
+import julian
+import placefinder
+import time
+
+
+def provincesURI(time):
+    """
+    This function defines the GOV objects that count as target provinces at different times.
+    :param time: year to which an administrative assignment should be made
+    :return: dictionary of GOV object URIs and the textual description of the respective province;
+             an empty dictionary for the years 1872 to 1989, for which no provinces are defined here
+    """
+    if time <= 1871:  # for times before 1872
+        return {
+            "object_190122": "A 01 Provinz Holstein",
+            # Problem in the GOV: The places are not linked to the historical offices, so the province is never found
+            "adm_131053": "A 02 Provinz Lauenburg",
+            "object_1081716": "A 03 Provinz Brandenburg (ohne Berlin)",
+            # if Berlin is meant, it will be recorded before
+            "object_190330": "A 04 Provinz Hessen-Nassau",
+            "object_268785": "A 05 Provinz Hohenzollern",
+            "object_284443": "A 05 Provinz Hohenzollern",
+            # Hohenzollern-Sigmaringen goes 1850 to Hohenzollerschen Landen
+            "adm_368500": "A 06 Provinz Ostpreußen",
+            "adm_368480": "A 07 Provinz Pommern",
+            "object_211667": "A 08 Provinz Posen",
+            "object_279654": "A 09 Provinz Sachsen",
+            "adm_368470": "A 10 Provinz Schlesien",
+            "object_190325": "A 11 Provinz Westfalen",
+            "object_213750": "A 12 Provinz Westpreußen",
+            "object_1047283": "A 13 Rheinprovinz",  # Provinz Jülich-Kleve-Berg until 1822
+            "object_405464": "A 13 Rheinprovinz",  # Provinz Großherzogtum Niederrhein until 1822
+            "object_190337": "A 13 Rheinprovinz",
+            "BERLINJO62PM": "A 14 Provinz Berlin",
+            "object_257607": "B 01 Amt Bergedorf",
+            "adm_369040": "B 02 Hansestadt Bremen",
+            "adm_369020": "B 03 Stadt Hamburg",
+            "LUBECKJO53IU": "B 04 Stadt Lübeck",
+            "adm_136412": "B 05 Stadt Frankfurt am Main",
+            "object_217406": "B 06 Fürstentum Lippe-Detmold",
+            "object_217818": "B 07 Fürstentum Schaumburg-Lippe",
+            "object_218152": "B 08 Fürstentum Waldeck-Pyrmont",
+            "object_352387": "B 09 Großherzogtum Oldenburg",
+            "object_217952": "B 10 Großherzogtum Baden",
+            "object_218147": "B 11 Hessen",
+            "object_217750": "B 12 Großherzogtum Mecklenburg-Schwerin",
+            "object_217749": "B 13 Großherzogtum Mecklenburg-Strelitz (einschließlich des Fürstentums Ratzeburg)",
+            "object_190873": "B 14 Herzogtum Anhalt",
+            "object_217954": "B 15 Herzogtum Braunschweig",
+            "object_218153": "B 16 Herzogtum Nassau",
+            "object_190098": "B 17 Herzogtum Schleswig",
+            "object_190729": "B 18 Königreich Württemberg",
+            "object_217953": "B 19 Königreich Bayern",
+            "object_190327": "B 20 Königreich Hannover",
+            "object_218149": "B 21 Königreich Sachsen",
+            "object_275299": "B 22 Kurfürstentum Hessen",  # here equated with Kurhessen
+            "object_284442": "B 23 Landgrafschaft Hessen-Homburg",
+            # "B 24 Thüringische Staaten" has no key of its own: it is divided into many sub-states as follows
+            "object_218143": "B 24 Thüringische Staaten",  # Sachsen-Weimar-Eisenach
+            "object_284441": "B 24 Thüringische Staaten",  # Reuß Jüngere Linie
+            "object_218134": "B 24 Thüringische Staaten",  # Reuß Ältere Linie
+            "object_218137": "B 24 Thüringische Staaten",  # Sachsen-Altenburg
+            "object_218138": "B 24 Thüringische Staaten",  # Sachsen-Coburg-Gotha
+            "object_265487": "B 24 Thüringische Staaten",  # Sachsen Gotha
+            "object_218142": "B 24 Thüringische Staaten",  # Sachsen-Meiningen
+            "object_218150": "B 24 Thüringische Staaten",  # Schwarzburg-Rudolstadt
+            "object_218151": "B 24 Thüringische Staaten",  # Schwarzburg-Sondershausen
+            "object_218141": "B 24 Thüringische Staaten"  # Sachsen-Hildburghausen, has no subordinate objects
+        }
+    if time >= 1990:  # for times after 1989
+        return {
+            "BERLINJO62PM": "Land Berlin",
+            "object_218149": "Freistaat Sachsen",
+            "adm_369080": "Land Baden-Württemberg",
+            "adm_369090": "Freistaat Bayern",
+            "adm_369120": "Land Brandenburg",
+            "adm_369040": "Freie Hansestadt Bremen",
+            "object_1259992": "Freie und Hansestadt Hamburg",
+            "adm_369060": "Land Hessen",
+            "adm_369130": "Land Mecklenburg-Vorpommern",
+            "adm_369030": "Land Niedersachsen",
+            "adm_369050": "Land Nordrhein-Westfalen",
+            "adm_369070": "Land Rheinland-Pfalz",
+            "adm_369100": "Saarland",
+            "adm_369150": "Land Sachsen-Anhalt",
+            "adm_369010": "Land Schleswig-Holstein",
+            "adm_369160": "Freistaat Thüringen"
+        }
+    return {}  # 1872-1989: nothing is defined; an empty dictionary keeps key lookups of callers from failing on None
+
+
+def provinceFinder(govid, referenceYear, client):
+    """
+    This function determines the historical-administrative affiliation of an object at a given time.
+    :param govid: GOV identifier (string)
+    :param referenceYear: year to which an administrative assignment should be made
+    :param client: connection to the GOV-Webservice
+    :return: textual description of the province, or the string "None" if no province could be determined
+    """
+    # if this variable is 1, the program will be stopped for one second in case of internet connection failures
+    # this prevents an abort of the program due to internet problems, but leads to a longer runtime
+    withSleeping = 0
+
+    # definition of prohibited object types
+    bannedObjectTypes = placefinder.bannedObjects()
+    # assignment of objects to be found and historical-administrative units
+    provinces = provincesURI(referenceYear)
+
+    # without a GOV identifier or without target provinces for the reference year nothing can be found
+    if govid == "" or not provinces:
+        return ("None")
+
+    govidBefore = 0  # initialization
+    # following loop jumps one level up in the membership tree per iteration
+    # number of 10 is currently chosen arbitrarily, in the hope that no tree has more levels
+    for ab in range(0, 10):
+        # here, possible superordinate objects are included, which are appropriate in time
+        govidsList = []  # list A, priority
+        # List B (non priority) is required if no object fits so well that it is included in List A.
+        nonPrioGovidsList = []  # list B, non priority
+        # termination condition: if the same object is examined twice in a row, then abort
+        # query is used to improve the runtime, so that the same object is not searched max. 10 times
+        if govid == govidBefore:
+            print("Error: Object can no longer take a meaningful step (GOV-ID, GOV-ID before):", govid, govidBefore)
+            break
+        # since "govid" changes, the previous one must be cached
+        govidBefore = govid
+        # check if the object already matches a province
+        try:  # if yes, then there is no KeyError
+            province = provinces[govid]
+            return (province)
+        except KeyError:
+            # information about the object is obtained from the web service (a dictionary that is composed of dictionary)
+            if withSleeping == 1:
+                for run in range(1000):
+                    try:
+                        govidInfo = callWebservice(govid, client)
+                        break  # success: stop retrying instead of repeating the request
+                    except Exception:  # if the connection is just gone the program should not crash
+                        time.sleep(1)
+                        print("Status: Sleeping for 1 s.")
+                        if run == 999:
+                            print("Status: Connection error")
+                            return ("None")  # every attempt failed, so there is no data to evaluate
+            else:
+                govidInfo = callWebservice(govid, client)
+            # from this the entry "part-of" is required
+            govidInfoSuperior = govidInfo['part-of']
+            # if "part-of" is empty, then the info is in "located-in" if necessary
+            if len(govidInfoSuperior) == 0:
+                govidInfoSuperior = govidInfo["located-in"]
+
+            # every superior object is now searched
+            # The date can be in three places: 1. in timespan, 2. in begin-year/end-year, 3. in year
+            for superior in range(len(govidInfoSuperior)):
+                # if timespan is available (not None), use the years from it
+                if govidInfoSuperior[superior]["timespan"] is not None:
+                    yearBegin = begincalculator(govidInfoSuperior[superior])
+                    yearEnd = endcalculator(govidInfoSuperior[superior])
+                    # check if the timespan matches the searched time
+                    # if yes a list is extended
+                    if yearBegin <= referenceYear and yearEnd >= referenceYear:
+                        govid = govidInfoSuperior[superior]["ref"]
+                        if callWebservice(govid, client)["type"][0]["value"] not in bannedObjectTypes:
+                            govidsList.append(govid)
+                    else:
+                        if callWebservice(govidInfoSuperior[superior]["ref"], client)["type"][0][
+                            "value"] not in bannedObjectTypes:
+                            nonPrioGovidsList.append(govidInfoSuperior[superior]["ref"])
+                # if timespan not available
+                else:
+                    try:
+                        # begin is determined
+                        if govidInfoSuperior[superior]["begin-year"] is None:
+                            yearBegin = 1  # sets begin to year 1
+                        else:
+                            yearBegin = govidInfoSuperior[superior]["begin-year"]
+                        # end is determined
+                        if govidInfoSuperior[superior]["end-year"] is None:
+                            yearEnd = 9999  # set end to year 9999
+                        else:
+                            yearEnd = govidInfoSuperior[superior]["end-year"]
+                        # if an object has an assumed time (start 1, end 9999), then always list B (problem otherwise e.g. with KIRORFJO40NS, adm_137138)
+                        if yearBegin == 1 or yearEnd == 9999:
+                            if callWebservice(govidInfoSuperior[superior]["ref"], client)["type"][0][
+                                "value"] not in bannedObjectTypes:
+                                nonPrioGovidsList.append(govidInfoSuperior[superior]["ref"])
+                        # comparison with reference time
+                        elif yearBegin <= referenceYear and yearEnd >= referenceYear:
+                            govid = govidInfoSuperior[superior]["ref"]
+                            if callWebservice(govid, client)["type"][0]["value"] not in bannedObjectTypes:
+                                govidsList.append(govid)
+                        else:
+                            if callWebservice(govidInfoSuperior[superior]["ref"], client)["type"][0][
+                                "value"] not in bannedObjectTypes:
+                                nonPrioGovidsList.append(govidInfoSuperior[superior]["ref"])
+                    except TypeError:
+                        print(
+                            "Error: A problem has occurred in the calculation of time spans. Presumably there are letters as numbers:",
+                            govidInfoSuperior[superior])
+            # if one of the objects in list A or B is one of the target objects, then take it
+            for i in govidsList:  # list A
+                try:
+                    province = provinces[i]
+                    return (province)  # Search was successful!
+                except KeyError:
+                    continue
+            for i in nonPrioGovidsList:  # list B
+                try:
+                    province = provinces[i]
+                    return (province)  # Search was successful!
+                except KeyError:
+                    continue
+            # if list A is empty, then list B should be used
+            if len(govidsList) == 0:
+                # if list B is also empty, then you should try to fill it further
+                if len(nonPrioGovidsList) == 0:  # Example: Case LIEHA2JO62RV, which has no part-of
+                    for a in range(len(govidInfoSuperior)):
+                        # the type of the following object is of interest (not the previous one)
+                        if callWebservice(govidInfoSuperior[a]["ref"], client)["type"][0][
+                            "value"] not in bannedObjectTypes:
+                            nonPrioGovidsList.append(govidInfoSuperior[a]["ref"])
+                govidsList = nonPrioGovidsList
+
+            # rate objects in list A or B
+
+            # delete duplicate values
+            # duplicate affiliations to the same object at different times may exist (e.g. adm_144024), but this is recognized below
+            govidsList = list(set(govidsList))
+
+            # if list contains only one object, then this is the appropriate one to perform the next iteration
+            if len(govidsList) == 1:
+                govid = govidsList[0]
+            # if list contains no object, then cancel
+            elif len(govidsList) == 0:
+                # mandatory abort, because no object could be determined to perform the next iteration
+                break
+            else:  # case where list contains more than one value
+                closerInTime = []  # initialization
+                # each object in the list is checked to see how close the time limits are to the reference time
+                for elementGovidsList in govidsList:
+                    # a simple list comprehension to find the index is inappropriate, since the searched value can occur several times
+                    # therefore a list is created
+                    indexList = []  # results are stored in this list
+                    for counter, resultPartOf in enumerate(govidInfoSuperior):
+                        if resultPartOf["ref"] == elementGovidsList:
+                            indexList.append(counter)
+                    if len(indexList) == 0:
+                        index = None
+                        print("Error: The object name does not occur.")
+                    for index in indexList:
+                        if govidInfoSuperior[index][
+                            "timespan"] is not None:  # if timespan is given, then it is more detailed
+                            yearBegin = begincalculator(govidInfoSuperior[index])
+                            yearEnd = endcalculator(govidInfoSuperior[index])
+                        # if only one year, but no begin or end
+                        elif govidInfoSuperior[index]["begin-year"] is None and \
+                                govidInfoSuperior[index]["end-year"] is None and \
+                                govidInfoSuperior[index]["year"] is not None:
+                            yearBegin = govidInfoSuperior[index]["year"]
+                            yearEnd = govidInfoSuperior[index]["year"]
+                        else:  # if no timespan
+                            yearBegin = govidInfoSuperior[index]["begin-year"]
+                            if yearBegin is None:  # if there is no value
+                                yearBegin = 1
+                            yearEnd = govidInfoSuperior[index]["end-year"]
+                            if yearEnd is None:
+                                yearEnd = 9999
+                        diffBegin = abs(yearBegin - referenceYear)
+                        diffEnd = abs(yearEnd - referenceYear)
+                        clusterDict = {
+                            "object": elementGovidsList,
+                            "diffbegin": diffBegin,
+                            "diffend": diffEnd,
+                            "begin-year": yearBegin,
+                            "end-year": yearEnd
+                        }
+                        closerInTime.append(clusterDict)  # list of dictionaries
+                diff = 9999  # initialization
+                # In the following it is examined which of the chronologically obvious results is the closest in time.
+                # it is irrelevant whether the difference lies before or after the reference time
+                for counter, i in enumerate(closerInTime):
+                    # Equal comparisons are critical in cases where time limits overlap (e.g. object_289942 --> until 1920, since 1920)
+                    if int(i["diffbegin"]) < diff:
+                        diff = int(i["diffbegin"])
+                        closestInTime = counter
+                    elif int(i["diffbegin"]) == diff:
+                        # search the absolute value of the start (not the difference)
+                        yearBegin = i["begin-year"]
+                        # if reference period is smaller than diffbegin
+                        if referenceYear <= yearBegin:
+                            # if it is "begin" and the other "end", then take the one with the end
+                            # if the previous is no end (then neither + nor - 0), then take after new
+                            if (closerInTime[closestInTime]["diffend"] + diff) != 0 and (
+                                    closerInTime[closestInTime]["diffend"] - diff) != 0:
+                                closestInTime = counter
+                        # larger
+                        elif referenceYear > yearBegin:
+                            # if the previous one is no beginning (then neither + nor - 0), then move to new one
+                            if (closerInTime[closestInTime]["diffbegin"] + diff) != 0 and (
+                                    closerInTime[closestInTime]["diffbegin"] - diff) != 0:
+                                closestInTime = counter
+                    if int(i["diffend"]) < diff:
+                        diff = int(i["diffend"])
+                        closestInTime = counter
+                    elif int(i["diffend"]) == diff:
+                        # search the beginning of the year
+                        yearEnd = i["end-year"]
+                        # if reference period smaller than diffbegin
+                        if referenceYear <= yearEnd:
+                            # take this if the previous (closestInTime) is a start or no end
+                            if (closerInTime[closestInTime]["diffend"] + diff) != 0 and (
+                                    closerInTime[closestInTime]["diffend"] - diff) != 0:
+                                closestInTime = counter
+                        # larger
+                        elif referenceYear > yearEnd:
+                            # take this if the previous one is not a beginning
+                            if (closerInTime[closestInTime]["diffbegin"] + diff) != 0 and (
+                                    closerInTime[closestInTime]["diffbegin"] - diff) != 0:
+                                closestInTime = counter
+                # object with the closest reference time is selected
+                # if the reference time is the same, the last object is selected (<=)
+                # Reason: In several regularly occurring special cases (e.g. some places in Poznan) the right one is rather behind
+                govid = closerInTime[closestInTime]["object"]
+    return ("None")
+
+
+def callWebservice(govid, client):
+    """
+    Fetch the record of one object from the GOV webservice.
+    An internet connection is required.
+    :param govid: GOV identifier
+    :param client: connection to the GOV-Webservice
+    :return: information of the GOV about the corresponding GOV identifier
+    """
+    # the lookup is delegated to the "getObject" operation of the client's service
+    return client.service.getObject(govid)
+
+
+def begincalculator(data):
+    """
+    Determine the year in which an administrative affiliation begins.
+    :param data: affiliation entry; its "timespan" holds the begin with a Julian date under "jd"
+    :return: year as integer; 1 if the timespan has no begin
+    """
+    start = data["timespan"]["begin"]
+    if start is None:
+        return 1  # open beginning: assume a very early year
+    # the Julian date is reduced by 2400000 and then read as a modified Julian date
+    # NOTE(review): the usual MJD offset is 2400000.5 - confirm that the half-day shift is intended
+    shifted = start["jd"] - 2400000
+    moment = julian.from_jd(shifted, fmt='mjd')
+    # cast to int so that the year can be compared with other years
+    return int(moment.year)
+
+
+def endcalculator(data):
+    """
+    Determine the year in which an administrative affiliation ends.
+    :param data: affiliation entry; its "timespan" holds the end with a Julian date under "jd"
+    :return: year as integer; 9999 if the timespan has no end
+    """
+    stop = data["timespan"]["end"]
+    if stop is None:
+        return 9999  # open end: assume a very late year
+    # the Julian date is reduced by 2400000 and then read as a modified Julian date
+    # NOTE(review): the usual MJD offset is 2400000.5 - confirm that the half-day shift is intended
+    shifted = stop["jd"] - 2400000
+    moment = julian.from_jd(shifted, fmt='mjd')
+    # cast to int so that the year can be compared with other years
+    return int(moment.year)
+
+
+def mainProvinceFinder(resultPlaceFinder, filename, client, time):
+    """
+    Assign a historical province to every identified place name of one source.
+    :param resultPlaceFinder: list of dictionaries, which contains the identification for each location
+    :param filename: name of the file/source; entries of other sources are skipped
+    :param client: connection to the GOV-Webservice
+    :param time: year to which an administrative assignment should be made
+    :return: list of dictionaries containing urbanonym, source, GOV-identifier and assigned province
+    """
+
+    # result list: one dictionary per place name of the requested source
+    assignments = []
+    # perform clustering for each urbanonym of the identification
+    for entry in resultPlaceFinder:
+        # entries of other sources only happen with data loaded from CSV
+        if entry["filename"] == filename:
+            identifier = entry["id"]  # GOV identifier
+            # a failed identification is marked with "NONE"; clustering cannot be successful then
+            if identifier == "NONE":
+                assigned = "NONE"
+            else:
+                # trigger clustering if identification is successful
+                assigned = provinceFinder(identifier, time, client)
+            # keep the place name, its source and its identifier together with the province
+            assignments.append({
+                "original name": entry["original name"],
+                "filename": entry["filename"],
+                "id": identifier,
+                "province": assigned
+            })
+    return assignments
-- 
GitLab