Neue Datei hochladen

715e1270 · Marcus Baumgarten · 413832e4 · 715e1270
Commit 715e1270 authored 2 years ago by Marcus Baumgarten
--- a/normform_KLK.py
+++ b/normform_KLK.py
+import csv
+import json
+import re
# load data
# the input is a tab-separated export; every row becomes one dict keyed by the column headers
encoding = "utf-8"
delimiter = "\t"
data = []
# newline="" leaves the handling of line breaks (also inside quoted fields) to the csv module
with open("leipzig_testamente-gesamt.csv", "r", newline="", encoding=encoding) as file:
    for row in csv.DictReader(file, delimiter=delimiter):
        # a plain dict copy is sufficient; no serialization round trip is needed
        data.append(dict(row))
def surnameSeperator(lastname, sex):
    """
    This function breaks the last name into its components.

    The markers are checked in a fixed order ("geb.", "Geb.", "verw.", "verehel.",
    "Verehe.") and only the first marker that occurs in the name is evaluated.
    The text after a marker starts one character behind the marker (the
    separating blank is skipped). For "verehel." and "Verehe." the text in
    front of the marker is taken as birth name, after removing at most one
    trailing blank and then at most one trailing comma.

    :param lastname: surname (string)
    :param sex: sex, binary (string); only "M" is treated specially
    :return: overview of change of surname (dictionary) with the keys
        "surnameMarriage1", "surnameMarriage2", "surnameMarriage3",
        "surnameGiven" and "surnameUnknown"; unused components are ""
    """

    def textAfter(marker):
        # skip the marker itself plus the one character that follows it
        return lastname[lastname.find(marker) + len(marker) + 1:]

    def textBefore(marker):
        head = lastname[:lastname.find(marker)]
        # remove one trailing blank, then one trailing comma ("Name, verehel. ...")
        if head[-1:] == " ":
            head = head[:-1]
        if head[-1:] == ",":
            head = head[:-1]
        return head

    # initialization of the result; the key order is kept as it was
    surnameDict = {
        "surnameMarriage1": "",
        "surnameMarriage2": "",
        "surnameMarriage3": "",
        "surnameGiven": "",
        "surnameUnknown": ""}

    # parsing the surname
    # recognize the birth name
    if "geb." in lastname:
        surnameDict["surnameGiven"] = textAfter("geb.")
    elif "Geb." in lastname:
        surnameDict["surnameGiven"] = textAfter("Geb.")
    # recognize the married name
    elif "verw." in lastname:
        surnameDict["surnameMarriage1"] = textAfter("verw.")
    elif "verehel." in lastname:
        surnameDict["surnameMarriage1"] = textAfter("verehel.")
        surnameDict["surnameGiven"] = textBefore("verehel.")
    elif "Verehe." in lastname:
        # handled like "verehel.": previously the married name was stored in an
        # unused variable and therefore never reached the result
        surnameDict["surnameMarriage1"] = textAfter("Verehe.")
        surnameDict["surnameGiven"] = textBefore("Verehe.")
    # for men the surname is always the birth name
    elif sex == "M":
        surnameDict["surnameGiven"] = lastname
    # for women this is not clear
    else:
        surnameDict["surnameUnknown"] = lastname
    return surnameDict
# correction of the sources
# sometimes separates numbers with ";" instead of ",", e.g. Bd. 34 Bl. 23-24; 29
# NOTE(review): the loop variable names `i` and `n2` are kept unchanged in case later
# top-level code reads them after this loop
for i in data:
    source = i["Band und Blatt"]
    # remove a semicolon (optionally followed by one blank) at the end
    if source[-1:] == ";":
        source = source[:-1]
    elif source[-2:] == "; ":
        source = source[:-2]
    # if there is no capital "B" two characters after a semicolon, make it a comma
    # (presumably the "B" is the start of the next "Bd." reference)
    characters = list(source)
    for n2, character in enumerate(characters):
        # slicing instead of indexing: a semicolon within the last two characters
        # yields "" here instead of raising an IndexError
        if character == ";" and source[n2 + 2:n2 + 3] != "B":
            characters[n2] = ","
    i["Band und Blatt"] = "".join(characters)
# add empty columns
# ID columns: created empty only when the row does not have them yet
keysAppend = ["idSpouse1", "idSpouse2", "idSpouse3", "idFather", "idMother"]
# addition of variables that are irrelevant here but occur in the norm form
addVarList = [
    "firstnameChange", "surnameChange",
    "birthday", "birthplace", "birthplaceGOV",
    "growthUpPlace", "growthUpPlaceGOV",
    "baptismday", "baptismplace", "baptismplaceGOV",
    "marriageday1", "marriageday2", "marriageday3",
    "marriageplace1", "marriageplace2", "marriageplace3",
    "marriageplaceGOV1", "marriageplaceGOV2", "marriageplaceGOV3",
    "ageAtMarriage1", "ageAtMarriage2", "ageAtMarriage3",
    "divorceday1", "divorceday2", "divorceday3",
    "deathplace", "deathplaceGOV", "causeOfDeath",
    "martialStatusAtDeath", "ageAtDeath",
    "burialday", "burialplace", "burialplaceGOV",
]
for i in data:
    for key in keysAppend:
        i.setdefault(key, "")
    # these columns are always set to "", even if the row already contains them
    i.update(dict.fromkeys(addVarList, ""))
# transferring the data to the standard form columns
parsedLastnameKeys = ["surnameUnknown", "surnameMarriage1", "surnameMarriage2", "surnameMarriage3", "surnameGiven"]
eventList = ["Ereignis 1",
             "Ereignis 2",
             "Ereignis 3",
             "Ereignis 4",
             "Ereignis 5",
             "Ereignis 6",
             "Ereignis 7",
             "Ereignis 8"]
# "Z" is added to the ID in order to exclude identical IDs to the KLF
# variable "id" already exists in the KLK, which is why it is always updated
# this is done for all rows before any relationship is resolved: the lookups below compare
# against the prefixed ID, so a reference person standing later in the file would otherwise
# never be found
for i in data:
    i.update({"id": "Z" + i["id"]})
# rows grouped by their prefixed ID, so that reference persons are found without scanning all rows
rowsById = {}
for i in data:
    rowsById.setdefault(i["id"], []).append(i)
for rowNumber, i in enumerate(data):
    # a standard form column is only filled when the input does not already contain it
    if "firstnameGiven" not in i:
        i.update({"firstnameGiven": i["firstname"]})
    if "source" not in i:
        i.update({"source": i["Band und Blatt"]})
    if "occupation" not in i:
        i.update({"occupation": i["Stand/Beruf"]})
    if "sex" not in i:
        if i["Geschlecht"] == "männlich":
            i.update({"sex": "M"})
        elif i["Geschlecht"] == "weiblich":
            i.update({"sex": "F"})
        else:
            i.update({"sex": ""})
    if "deathday" not in i:
        i.update({"deathday": i["Sterbedatum vor"]})
    # surnames
    parsedLastname = surnameSeperator(i["lastname"], i["sex"])
    for key in parsedLastnameKeys:
        if key not in i:
            i.update({key: parsedLastname[key]})
    # with third person/relative
    if i["Rolle"] == "Drittperson/Verwandter":
        # save ID of the reference person
        # data fields related person ID and related person name are interchanged
        idRelative = "Z" + i["Name Bezugsperson"]
        referenceRows = rowsById.get(idRelative, [])
        relation = i["Art der Beziehung"]
        # parent column that has to be completed for the reference person ("" = none)
        parentKey = ""
        # husband/wife/fiancé (fiancé is assumed to be the same as husband)
        # "fiancée" does not occur
        if relation == "Ehemann" or relation == "Ehefrau" or relation == "Verlobter":
            # for a spouse, add ID of the spouse
            if i["idSpouse1"] == "":
                i.update({"idSpouse1": idRelative})
            # the ID must also be supplemented for the spouse
            for j in referenceRows:
                if j["idSpouse1"] == "":
                    j.update({"idSpouse1": i["id"]})
        elif relation == "Mutter":
            parentKey = "idMother"
        elif relation == "Vater":
            parentKey = "idFather"
        # son/daughter: the sex of this row decides between father and mother
        # (previously this branch used a stale loop index and the misspelling "weilich")
        elif relation == "Sohn" or relation == "Tochter":
            if i["Geschlecht"] == "männlich":
                parentKey = "idFather"
            elif i["Geschlecht"] == "weiblich":
                parentKey = "idMother"
        # brother/sister, nephew/niece, "Sonstige" or an empty field: no information
        # about the parents known, nothing to do
        if parentKey != "":
            for j in referenceRows:
                # NOTE(review): as before, the value written is the reference row's own
                # "Name Bezugsperson"; confirm that this is intended and not i["id"]
                if j[parentKey] == "" and j["Name Bezugsperson"] != "":
                    j.update({parentKey: "Z" + j["Name Bezugsperson"]})
    # If "Eröffnung" in an event, then adopt the year as the year of death
    for event in eventList:
        if "Eröffnung" in i[event]:
            # opening happens a few days after death
            # assumption here: It always happens in the year of death
            # select year from string
            # delete everything except numbers and take the last four of them
            # overwrite only if no death date is known yet
            if not i.get("deathday"):
                i.update({"deathday": re.sub(r"\D", "", i[event])[-4:]})
            break  # only the first "Eröffnung" event is used
# spouses have separate index cards and may appear twice as reference persons
# key here is the source reference of the wills
# this is not shown in the standard form, so it must already be combined here
# in new loop to continue with completely corrected values
# result is a list containing the IDs to be merged:
#   [id]            -> the person stays as it is
#   [id, idOther]   -> the third person `id` and the testator `idOther` are the same person
idList = []
# rows grouped by ID to find the reference person of a third person
rowsById = {}
for j in data:
    rowsById.setdefault(j["id"], []).append(j)
# only testators ("Erblasser") can be the duplicate of a third person; their source parts and
# surname components are determined once instead of once per comparison
testators = []
for y in data:
    if y["Rolle"] == "Erblasser":
        sources2 = {part for part in y["Band und Blatt"].split(";") if part != ""}
        surnames2 = {name for name in surnameSeperator(y["lastname"], y["sex"]).values() if name != ""}
        testators.append((y, sources2, surnames2))
for rowNumber, i in enumerate(data):
    if rowNumber % 500 == 0:
        print("Status:", round(rowNumber * 100 / len(data), 2), "percent is finished")
    # also output persons that are no third person, otherwise only the third persons appear
    if i["Rolle"] != "Drittperson/Verwandter":
        idList.append([i["id"]])
        continue
    # when person appears as a third party
    # there is an error in the DES database in the assignment of the data field,
    # actually this would be "Bezugsperson ID"
    idRelative = "Z" + i["Name Bezugsperson"]
    # surname components of the third person; empty components are ignored, otherwise two
    # empty components would always count as a matching name
    surnames1 = {name for name in surnameSeperator(i["lastname"], i["sex"]).values() if name != ""}
    referenceRows = rowsById.get(idRelative, [])
    if not referenceRows:
        # reference person does not exist in the data: keep the third person as a single person
        # (previously such rows were dropped from the output)
        idList.append([i["id"]])
        continue
    for j in referenceRows:
        # now the reference person was found, from which the source is now taken
        # empty parts are skipped: although an incorrect entry in the data set, they would
        # match every other empty part
        sources = {part for part in j["Band und Blatt"].split(";") if part != ""}
        # assumption: there can be only one match, the first one in file order is taken
        double = None
        for y, sources2, surnames2 in testators:
            # a source part must be identical (being contained is not enough), the testator
            # must not be the reference person itself, and a part of the surname must be equal,
            # because a person can have two third persons; the name of the third person is
            # often less extensive
            if (y["id"] != idRelative
                    and not sources.isdisjoint(sources2)
                    and not surnames1.isdisjoint(surnames2)):
                double = y
                break
        if double is None:
            # nothing found, then it is only one person
            idList.append([i["id"]])
        else:
            idList.append([i["id"], double["id"]])
# some IDs that are part of a merge are at this point also present as single entries
# this would lead to the double output of persons
# to prevent this, the single entries of merged persons must be removed
# generate a list with the persons to be merged
idListMergedPerson = []
for i in idList:
    if len(i) != 1:
        idListMergedPerson.extend(i)
# set copy for constant-time membership tests
mergedIds = set(idListMergedPerson)
# generate a list filtering out the IDs that are already merged:
# entries to be merged are maintained, single entries only if the ID is not part of a merge
newIdList = [i for i in idList if len(i) != 1 or i[0] not in mergedIds]
idList = newIdList
# update ID of the reference person
# a merged person gets the new ID "Z" + position of its pair in idList; all columns that
# refer to one of the two old IDs must point to the new ID
# NOTE(review): the new IDs share the "Z" + number scheme of the old IDs, so a new ID may be
# equal to the old ID of a different person; confirm whether a distinct prefix is needed
idKeys = ["Name Bezugsperson", "idFather", "idMother", "idSpouse1", "idSpouse2",
          "idSpouse3"]  # List can be extended if there are more ID columns
# old ID -> new ID, only for entries to be merged (those with two IDs)
# if an ID occurs in several pairs the first pair wins
newIdByOldId = {}
for idNew, double in enumerate(idList):
    if len(double) == 2:
        for idOld in double:
            newIdByOldId.setdefault(idOld, "Z" + str(idNew))
# every reference is translated exactly once, so a value that was just rewritten cannot be
# rewritten again because it happens to equal another old ID
for entry in data:
    for idKey in idKeys:
        if entry[idKey] in newIdByOldId:
            entry[idKey] = newIdByOldId[entry[idKey]]
# merge idList
# newData receives one record per entry of idList: single persons are taken over unchanged,
# pairs are merged into one new record
newData = []
# first row for every ID
entryById = {}
for i in data:
    entryById.setdefault(i["id"], i)
for idNew, double in enumerate(idList):
    if len(double) == 1:
        # single person: take over the entry if it exists
        if double[0] in entryById:
            newData.append(entryById[double[0]])
        continue
    # several that need to be merged
    firstEntry = entryById[double[0]]
    secondEntry = entryById[double[1]]
    # merge firstEntry and secondEntry
    matchedDict = {}
    for key in firstEntry:
        first = firstEntry[key]
        second = secondEntry[key]
        if key == "id":  # rewrite new ID
            matchedDict.update({key: "Z" + str(idNew)})
        elif first == second:
            matchedDict.update({key: first})
        # if only one contains information, take that one
        elif first == "" and second != "":
            matchedDict.update({key: second})
        elif first != "" and second == "":
            matchedDict.update({key: first})
        # when one is part of the other, only the longer writing
        elif str(first) in str(second):
            matchedDict.update({key: second})
        elif str(second) in str(first):
            matchedDict.update({key: first})
        # merge different information while retaining all information
        elif key in ["Rolle", "page", "Stand/Beruf"]:
            matchedDict.update({key: (first + ", " + second)})
        else:  # both contain information
            # here are mainly spelling mistakes, abbreviations or dates that are close to each other
            # so you can simply select one of those
            matchedDict.update({key: first})
    newData.append(matchedDict)
# delete keys that are no longer needed
# columns of the input format that have no counterpart in the standard form
keysDelete = [
    "page", "lastname", "firstname", "Stand/Beruf", "Rolle", "Ort",
    "Band und Blatt", "Familienstand",
    "Ereignis 1", "Ereignis 2", "Ereignis 3", "Ereignis 4",
    "Ereignis 5", "Ereignis 6", "Ereignis 7", "Ereignis 8",
    "Geschlecht", "Bezugsperson ID", "Name Bezugsperson", "Art der Beziehung",
    "Sterbedatum vor", "Datum von", "Datum bis", "Bemerkung",
]
for i in newData:
    for key in keysDelete:
        # a missing column is not an error
        i.pop(key, None)
# save standardized data
# the columns of the first record define the header; without any record there are no columns
fieldnames = list(newData[0].keys()) if newData else []
# the with block closes the file even if writing fails
with open("leipzig_testamente_normform.csv", "w", newline="", encoding=encoding) as writer:
    dataWriter = csv.DictWriter(writer, fieldnames=fieldnames, delimiter=delimiter)
    dataWriter.writeheader()
    dataWriter.writerows(newData)
print("Status: Finished")