"""
Assignment of GOV objects to historical-administrative units (provinces, states).

Starting from a GOV identifier, the membership tree of the GOV web service is
climbed ("part-of" / "located-in") until an object is reached that is listed as
a target province for the requested reference year.
"""
# Origin: provincefinder.py as introduced by commit
# 43edfb8117f247ef4f2ba433a5ca55800073adb2 (Marcus Baumgarten, 2022-09-28).

import time

import julian
import placefinder

# maximum number of levels that are climbed in the membership tree
# the number is chosen arbitrarily, in the hope that no tree has more levels
MAX_TREE_DEPTH = 10
# number of attempts if the web service is queried with "withSleeping" enabled
MAX_CONNECTION_ATTEMPTS = 1000


def provincesURI(time):
    """
    This function defines the URIs of different regions at different times.
    Definitions exist only for years up to 1871 and for years from 1990 onwards.
    Note: the parameter name hides the module "time" inside this function; it is kept for compatibility.
    :param time: year to which an administrative assignment should be made
    :return: dictionary of GOV object URIs and the textual description of the respective province,
             or None if no definition exists for the year (1872 to 1989)
    """
    # for times before 1872
    if time <= 1871:
        return {
            "object_190122": "A 01 Provinz Holstein",
            # Problem in the GOV: The places are not linked to the historical offices, so the province is never found
            "adm_131053": "A 02 Provinz Lauenburg",
            "object_1081716": "A 03 Provinz Brandenburg (ohne Berlin)",
            # if Berlin is meant, it will be recorded before
            "object_190330": "A 04 Provinz Hessen-Nassau",
            "object_268785": "A 05 Provinz Hohenzollern",
            "object_284443": "A 05 Provinz Hohenzollern",
            # Hohenzollern-Sigmaringen goes 1850 to Hohenzollerschen Landen
            "adm_368500": "A 06 Provinz Ostpreußen",
            "adm_368480": "A 07 Provinz Pommern",
            "object_211667": "A 08 Provinz Posen",
            "object_279654": "A 09 Provinz Sachsen",
            "adm_368470": "A 10 Provinz Schlesien",
            "object_190325": "A 11 Provinz Westfalen",
            "object_213750": "A 12 Provinz Westpreußen",
            "object_1047283": "A 13 Rheinprovinz",  # Provinz Jülich-Kleve-Berg until 1822
            "object_405464": "A 13 Rheinprovinz",  # Provinz Großherzogtum Niederrhein until 1822
            "object_190337": "A 13 Rheinprovinz",
            "BERLINJO62PM": "A 14 Provinz Berlin",
            "object_257607": "B 01 Amt Bergedorf",
            "adm_369040": "B 02 Hansestadt Bremen",
            "adm_369020": "B 03 Stadt Hamburg",
            "LUBECKJO53IU": "B 04 Stadt Lübeck",
            "adm_136412": "B 05 Stadt Frankfurt am Main",
            "object_217406": "B 06 Fürstentum Lippe-Detmold",
            "object_217818": "B 07 Fürstentum Schaumburg-Lippe",
            "object_218152": "B 08 Fürstentum Waldeck-Pyrmont",
            "object_352387": "B 09 Großherzogtum Oldenburg",
            "object_217952": "B 10 Großherzogtum Baden",
            "object_218147": "B 11 Hessen",
            "object_217750": "B 12 Großherzogtum Mecklenburg-Schwerin",
            "object_217749": "B 13 Großherzogtum Mecklenburg-Strelitz (einschließlich des Fürstentums Ratzeburg)",
            "object_190873": "B 14 Herzogtum Anhalt",
            "object_217954": "B 15 Herzogtum Braunschweig",
            "object_218153": "B 16 Herzogtum Nassau",
            "object_190098": "B 17 Herzogtum Schleswig",
            "object_190729": "B 18 Königreich Württemberg",
            "object_217953": "B 19 Königreich Bayern",
            "object_190327": "B 20 Königreich Hannover",
            "object_218149": "B 21 Königreich Sachsen",
            "object_275299": "B 22 Kurfürstentum Hessen",  # here equated with Kurhessen
            "object_284442": "B 23 Landgrafschaft Hessen-Homburg",
            "": "B 24 Thüringische Staaten",  # is divided into many sub-states as follows
            "object_218143": "B 24 Thüringische Staaten",  # Sachsen-Weimar-Eisenach
            "object_284441": "B 24 Thüringische Staaten",  # Reuß Jüngere Linie
            "object_218134": "B 24 Thüringische Staaten",  # Reuß Ältere Linie
            "object_218137": "B 24 Thüringische Staaten",  # Sachsen-Altenburg
            "object_218138": "B 24 Thüringische Staaten",  # Sachsen-Coburg-Gotha
            "object_265487": "B 24 Thüringische Staaten",  # Sachsen Gotha
            "object_218142": "B 24 Thüringische Staaten",  # Sachsen-Meiningen
            "object_218150": "B 24 Thüringische Staaten",  # Schwarzburg-Rudolstadt
            "object_218151": "B 24 Thüringische Staaten",  # Schwarzburg-Sondershausen
            "object_218141": "B 24 Thüringische Staaten",  # Sachsen-Hildburghausen, has no subordinate objects
        }
    # for times after 1989
    if time >= 1990:
        return {
            "BERLINJO62PM": "Land Berlin",
            "object_218149": "Freistaat Sachsen",
            "adm_369080": "Land Baden-Württemberg",
            "adm_369090": "Freistaat Bayern",
            "adm_369120": "Land Brandenburg",
            "adm_369040": "Freie Hansestadt Bremen",
            "object_1259992": "Freie und Hansestadt Hamburg",
            "adm_369060": "Land Hessen",
            "adm_369130": "Land Mecklenburg-Vorpommern",
            "adm_369030": "Land Niedersachsen",
            "adm_369050": "Land Nordrhein-Westfalen",
            "adm_369070": "Land Rheinland-Pfalz",
            "adm_369100": "Saarland",
            "adm_369150": "Land Sachsen-Anhalt",
            "adm_369010": "Land Schleswig-Holstein",
            "adm_369160": "Freistaat Thüringen",
        }
    # no target objects are defined for the years 1872 to 1989
    return None


def _fetchWithRetry(govid, client):
    """
    This function calls the GOV webservice and repeats the call after one second if it fails.
    :param govid: GOV identifier
    :param client: connection to the GOV-Webservice
    :return: information of the GOV about the corresponding GOV identifier
    :raises Exception: the error of the last attempt if all attempts have failed
    """
    for run in range(MAX_CONNECTION_ATTEMPTS):
        try:
            # a successful call ends the loop immediately
            return callWebservice(govid, client)
        except Exception:  # if the connection is just gone the program should not crash
            if run == MAX_CONNECTION_ATTEMPTS - 1:
                print("Status: Connection error")
                # without data the search cannot continue in a meaningful way
                raise
            time.sleep(1)
            print("Status: Sleeping for 1 s.")


def _closestCandidate(candidates, govidInfoSuperior, referenceYear):
    """
    This function selects the superordinate object whose time limits are closest to the reference year.
    :param candidates: list of GOV identifiers (more than one) that are possible next steps
    :param govidInfoSuperior: affiliation entries of the current object ("part-of" or "located-in")
    :param referenceYear: year to which an administrative assignment should be made
    :return: GOV identifier of the selected object
    """
    closerInTime = []  # list of dictionaries, one per affiliation entry of a candidate
    # each object in the list is checked to see how close the time limits are to the reference time
    for candidate in candidates:
        # the searched value can occur several times, therefore all matching entries are collected
        indexList = [counter for counter, resultPartOf in enumerate(govidInfoSuperior)
                     if resultPartOf["ref"] == candidate]
        if len(indexList) == 0:
            print("Error: The object name does not occur.")
        for index in indexList:
            entry = govidInfoSuperior[index]
            if entry["timespan"] is not None:  # if timespan is given, then it is more detailed
                yearBegin = begincalculator(entry)
                yearEnd = endcalculator(entry)
            # if only one year, but no begin or end
            elif entry["begin-year"] is None and entry["end-year"] is None and entry["year"] is not None:
                yearBegin = entry["year"]
                yearEnd = entry["year"]
            else:  # if no timespan
                yearBegin = entry["begin-year"]
                if yearBegin is None:  # if there is no value
                    yearBegin = 1
                yearEnd = entry["end-year"]
                if yearEnd is None:
                    yearEnd = 9999
            closerInTime.append({
                "object": candidate,
                "diffbegin": abs(yearBegin - referenceYear),
                "diffend": abs(yearEnd - referenceYear),
                "begin-year": yearBegin,
                "end-year": yearEnd
            })

    diff = 9999  # initialization
    closestInTime = 0  # initialization, is overwritten as soon as a difference is smaller than 9999
    # In the following it is examined which of the chronologically obvious results is the closest in time.
    # it is irrelevant whether the difference lies before or after the reference time
    for counter, i in enumerate(closerInTime):
        # Equal comparisons are critical in cases where time limits overlap
        # (e.g. object_289942 --> until 1920, since 1920)
        if int(i["diffbegin"]) < diff:
            diff = int(i["diffbegin"])
            closestInTime = counter
        elif int(i["diffbegin"]) == diff:
            # use the absolute value of the start (not the difference)
            yearBegin = i["begin-year"]
            if referenceYear <= yearBegin:
                # if it is "begin" and the other "end", then take the one with the end
                # if the previous is no end (then neither + nor - 0), then take the new one
                if (closerInTime[closestInTime]["diffend"] + diff) != 0 and (
                        closerInTime[closestInTime]["diffend"] - diff) != 0:
                    closestInTime = counter
            elif referenceYear > yearBegin:
                # if the previous one is no beginning (then neither + nor - 0), then move to new one
                if (closerInTime[closestInTime]["diffbegin"] + diff) != 0 and (
                        closerInTime[closestInTime]["diffbegin"] - diff) != 0:
                    closestInTime = counter
        if int(i["diffend"]) < diff:
            diff = int(i["diffend"])
            closestInTime = counter
        elif int(i["diffend"]) == diff:
            # use the absolute value of the end (not the difference)
            yearEnd = i["end-year"]
            if referenceYear <= yearEnd:
                # take this if the previous (closestInTime) is a start or no end
                if (closerInTime[closestInTime]["diffend"] + diff) != 0 and (
                        closerInTime[closestInTime]["diffend"] - diff) != 0:
                    closestInTime = counter
            elif referenceYear > yearEnd:
                # take this if the previous one is not a beginning
                if (closerInTime[closestInTime]["diffbegin"] + diff) != 0 and (
                        closerInTime[closestInTime]["diffbegin"] - diff) != 0:
                    closestInTime = counter
    # object with the closest reference time is selected
    # Reason for preferring later entries: In several regularly occurring special cases
    # (e.g. some places in Poznan) the right one is rather behind
    return closerInTime[closestInTime]["object"]


def provinceFinder(govid, referenceYear, client):
    """
    This function determines the historical-administrative affiliation to an object at a given time.
    :param govid: GOV identifier (string)
    :param referenceYear: year to which an administrative assignment should be made
    :param client: connection to the GOV-Webservice
    :return: province or "None"
    """
    # if this variable is 1, the program will be stopped for one second in case of internet connection failures
    # this prevents an abort of the program due to internet problems, but leads to a longer runtime
    withSleeping = 0

    # definition of prohibited object types
    bannedObjectTypes = placefinder.bannedObjects()
    # assignment of objects to be found and historical-administrative units
    provinces = provincesURI(referenceYear)

    # if GOV identifier empty, then return None
    if govid == "":
        return "None"
    # without target objects no province can be found, so the web service does not need to be queried
    if provinces is None:
        print("Error: No provinces are defined for the reference year:", referenceYear)
        return "None"

    # the type of an object does not change during the search, so it is requested only once per object
    allowedCache = {}

    def isAllowed(ref):
        if ref not in allowedCache:
            allowedCache[ref] = callWebservice(ref, client)["type"][0]["value"] not in bannedObjectTypes
        return allowedCache[ref]

    govidBefore = 0  # initialization
    # following loop jumps one level up in the membership tree per iteration
    for _ in range(MAX_TREE_DEPTH):
        # here, possible superordinate objects are included, which are appropriate in time
        govidsList = []  # list A, priority
        # List B (non priority) is required if no object fits so well that it is included in List A.
        nonPrioGovidsList = []  # list B, non priority
        # termination condition: if the same object is examined twice in a row, then abort
        if govid == govidBefore:
            print("Error: Object can no longer take a meaningful step (GOV-ID, GOV-ID before):", govid, govidBefore)
            break
        # since "govid" changes, the previous one must be cached
        govidBefore = govid
        # check if the object already matches a province
        if govid in provinces:
            return provinces[govid]

        # information about the object is obtained from the web service
        if withSleeping == 1:
            govidInfo = _fetchWithRetry(govid, client)
        else:
            govidInfo = callWebservice(govid, client)
        # from this the entry "part-of" is required
        govidInfoSuperior = govidInfo['part-of']
        # if "part-of" is empty, then the info is in "located-in" if necessary
        if len(govidInfoSuperior) == 0:
            govidInfoSuperior = govidInfo["located-in"]

        # every superior object is now searched
        # The date can be in three places: 1. in timespan, 2. in begin-year, end-year, 3. in year
        for entry in govidInfoSuperior:
            if entry["timespan"] is not None:
                # if timespan is available, use the years from it
                yearBegin = begincalculator(entry)
                yearEnd = endcalculator(entry)
                # objects whose timespan matches the searched time belong to list A, all others to list B
                targetList = govidsList if yearBegin <= referenceYear <= yearEnd else nonPrioGovidsList
                if isAllowed(entry["ref"]):
                    targetList.append(entry["ref"])
            else:
                try:
                    yearBegin = 1 if entry["begin-year"] is None else entry["begin-year"]
                    yearEnd = 9999 if entry["end-year"] is None else entry["end-year"]
                    # if an object has an assumed time (start 1, end 9999), then always list B
                    # (problem otherwise e.g. with KIRORFJO40NS, adm_137138)
                    if yearBegin == 1 or yearEnd == 9999:
                        targetList = nonPrioGovidsList
                    # comparison with reference time
                    elif yearBegin <= referenceYear <= yearEnd:
                        targetList = govidsList
                    else:
                        targetList = nonPrioGovidsList
                    if isAllowed(entry["ref"]):
                        targetList.append(entry["ref"])
                except TypeError:
                    print(
                        "Error: A problem has occurred in the calculation of time spans. Presumably there are letters as numbers:",
                        entry)

        # if one of the objects in list A or B is one of the target objects, then take it (list A first)
        for candidate in govidsList + nonPrioGovidsList:
            if candidate in provinces:
                return provinces[candidate]  # Search was successful!

        # if list A is empty, then list B should be used
        if len(govidsList) == 0:
            # if list B is also empty, then you should try to fill it further
            if len(nonPrioGovidsList) == 0:  # Example: Case LIEHA2JO62RV, which has no part-of
                for entry in govidInfoSuperior:
                    # the type of the following object is of interest (not the previous one)
                    if isAllowed(entry["ref"]):
                        nonPrioGovidsList.append(entry["ref"])
            govidsList = nonPrioGovidsList

        # delete duplicate values while keeping the order of the GOV entries
        # (the order matters for the selection below, a set would make it arbitrary)
        # duplicate affiliations to the same object at different times may exist (e.g. adm_144024),
        # but this is recognized in the selection of the closest object
        govidsList = list(dict.fromkeys(govidsList))

        if len(govidsList) == 1:
            # only one object, so this is the appropriate one to perform the next iteration
            govid = govidsList[0]
        elif len(govidsList) == 0:
            # mandatory abort, because no object could be determined to perform the next iteration
            break
        else:
            # more than one object, so the one closest in time is chosen
            govid = _closestCandidate(govidsList, govidInfoSuperior, referenceYear)
    return "None"


def callWebservice(govid, client):
    """
    This function calls the GOV webservice.
    An internet connection is required.
    :param govid: GOV identifier
    :param client: connection to the GOV-Webservice
    :return: information of the GOV about the corresponding GOV identifier
    """
    return client.service.getObject(govid)


def _yearFromTimespan(data, limit, default):
    """
    This function converts one limit of the timespan of an object (available as Julian date) into a year number.
    :param data: time information about administrative affiliations
    :param limit: "begin" or "end"
    :param default: year that is returned if the limit is not given
    :return: year as integer
    """
    value = data["timespan"][limit]
    if value is None:
        return default
    # NOTE(review): the Modified Julian Date is conventionally JD - 2400000.5; subtracting 2400000 shifts
    # the date by half a day. Kept unchanged - confirm that this is intended for the GOV values.
    jd = value["jd"] - 2400000  # julian date
    return int(julian.from_jd(jd, fmt='mjd').year)  # must be int to compare it


def begincalculator(data):
    """
    This function converts the timespan data of an object (available as Julian date) into a year number,
    which describes the beginning of the affiliation.
    :param data: time information about administrative affiliations
    :return: year as integer (1 if no beginning is given)
    """
    return _yearFromTimespan(data, "begin", 1)  # without value the start is set to a very early year


def endcalculator(data):
    """
    This function converts the timespan data of an object (available as Julian date) into a year number,
    which describes the end of membership.
    :param data: time information about administrative affiliations
    :return: year as integer (9999 if no end is given)
    """
    return _yearFromTimespan(data, "end", 9999)  # without value the end is set to a very late year


def mainProvinceFinder(resultPlaceFinder, filename, client, time):
    """
    This function assigns the identified urban names to a historical province.
    Note: the parameter name hides the module "time" inside this function; it is kept for compatibility.
    :param resultPlaceFinder: list of dictionaries, which contains the identification for each location
    :param filename: name of the file/source
    :param client: connection to the GOV-Webservice
    :param time: year to which an administrative assignment should be made
    :return: list of dictionaries containing urbanonym, source, GOV-identifier and assigned provinces
    """
    # perform clustering for each urbanonym of the identification
    provincesDictList = []
    for i in resultPlaceFinder:
        # only edit entries that match the source
        if i["filename"] != filename:
            continue  # only happens with data loaded from CSV
        govid = i["id"]  # GOV identifier
        # if identification has failed, then clustering cannot be successful
        if govid != "NONE":
            # trigger clustering if identification is successful
            resultProvinceFinder = provinceFinder(govid, time, client)
        else:
            resultProvinceFinder = "NONE"
        provincesDictList.append({
            "original name": i["original name"],
            "filename": i["filename"],
            "id": govid,
            "province": resultProvinceFinder
        })
    return provincesDictList