"""Convert the Leipzig wills index (KLK) into the standard ("norm") form.

Running this file as a script reads ``leipzig_testamente-gesamt.csv``
(tab separated, UTF-8), normalizes every index card, links relatives,
merges index cards that describe the same person and writes the result to
``leipzig_testamente_normform.csv``.
"""
import csv
import re

# input/output settings
encoding = "utf-8"
delimiter = "\t"
inputFile = "leipzig_testamente-gesamt.csv"
outputFile = "leipzig_testamente_normform.csv"

# values of the column "Rolle" that the script distinguishes
thirdPersonRole = "Drittperson/Verwandter"
testatorRole = "Erblasser"

# ID columns that are created empty if the source does not provide them
relationIdColumns = ["idSpouse1",
                     "idSpouse2",
                     "idSpouse3",
                     "idFather",
                     "idMother"]

# variables that are irrelevant here but occur in the norm form
emptyNormFormColumns = ["firstnameChange",
                        "surnameChange",
                        "birthday",
                        "birthplace",
                        "birthplaceGOV",
                        "growthUpPlace",
                        "growthUpPlaceGOV",
                        "baptismday",
                        "baptismplace",
                        "baptismplaceGOV",
                        "marriageday1",
                        "marriageday2",
                        "marriageday3",
                        "marriageplace1",
                        "marriageplace2",
                        "marriageplace3",
                        "marriageplaceGOV1",
                        "marriageplaceGOV2",
                        "marriageplaceGOV3",
                        "ageAtMarriage1",
                        "ageAtMarriage2",
                        "ageAtMarriage3",
                        "divorceday1",
                        "divorceday2",
                        "divorceday3",
                        "deathplace",
                        "deathplaceGOV",
                        "causeOfDeath",
                        "martialStatusAtDeath",
                        "ageAtDeath",
                        "burialday",
                        "burialplace",
                        "burialplaceGOV"]

eventColumns = ["Ereignis 1",
                "Ereignis 2",
                "Ereignis 3",
                "Ereignis 4",
                "Ereignis 5",
                "Ereignis 6",
                "Ereignis 7",
                "Ereignis 8"]

# columns of the source that are no longer needed in the norm form
obsoleteColumns = ["page",
                   "lastname",
                   "firstname",
                   "Stand/Beruf",
                   "Rolle",
                   "Ort",
                   "Band und Blatt",
                   "Familienstand",
                   "Ereignis 1",
                   "Ereignis 2",
                   "Ereignis 3",
                   "Ereignis 4",
                   "Ereignis 5",
                   "Ereignis 6",
                   "Ereignis 7",
                   "Ereignis 8",
                   "Geschlecht",
                   "Bezugsperson ID",
                   "Name Bezugsperson",
                   "Art der Beziehung",
                   "Sterbedatum vor",
                   "Datum von",
                   "Datum bis",
                   "Bemerkung"]

# columns that may hold the ID of another person
# (list can be extended if there are more ID columns)
idKeys = ["Name Bezugsperson", "idFather", "idMother", "idSpouse1", "idSpouse2", "idSpouse3"]

# abbreviations that separate the components of a surname
surnameMarkerPattern = re.compile(r"(?:geb|verw|verehel|verehe)\.", re.IGNORECASE)


def surnameSeperator(lastname, sex):
    """
    This function breaks the last name into its components.
    Only the first abbreviation found (checked in the order "geb.", "Geb.",
    "verw.", "verehel.", "Verehe.") is evaluated.
    :param lastname: surname (string)
    :param sex: sex, binary (string); "M" marks a man
    :return: overview of change of surname (dictionary) with the keys
        surnameMarriage1, surnameMarriage2, surnameMarriage3, surnameGiven
        and surnameUnknown; parts that were not recognized are ""
    """
    surnameMarriage1 = ""
    surnameGiven = ""
    surnameUnknown = ""

    def afterMarker(marker):
        # strip instead of skipping exactly one character, so that a missing
        # blank after the abbreviation does not cut off the first letter
        return lastname[lastname.find(marker) + len(marker):].strip()

    def beforeMarker(marker):
        # the name in front of the abbreviation, without trailing " " or ","
        return lastname[:lastname.find(marker)].rstrip(" ,")

    # NOTE(review): for "geb." and "verw." the name in front of the
    # abbreviation is not transferred to any component - confirm whether it
    # should be kept as a married name.
    # recognize the birth name
    if "geb." in lastname:
        surnameGiven = afterMarker("geb.")
    elif "Geb." in lastname:
        surnameGiven = afterMarker("Geb.")
    # recognize the married name
    elif "verw." in lastname:
        surnameMarriage1 = afterMarker("verw.")
    elif "verehel." in lastname:
        surnameMarriage1 = afterMarker("verehel.")
        surnameGiven = beforeMarker("verehel.")
    elif "Verehe." in lastname:
        # previously stored in an unused variable, so the married name was lost
        surnameMarriage1 = afterMarker("Verehe.")
        surnameGiven = beforeMarker("Verehe.")
    # for men the surname is always the birth name
    elif sex == "M":
        surnameGiven = lastname
    # for women this is not clear
    else:
        surnameUnknown = lastname

    return {
        "surnameMarriage1": surnameMarriage1,
        "surnameMarriage2": "",
        "surnameMarriage3": "",
        "surnameGiven": surnameGiven,
        "surnameUnknown": surnameUnknown}


def surnameComponents(lastname):
    """
    Collect every name that occurs in a surname, regardless of its meaning.
    :param lastname: surname (string)
    :return: non-empty parts between the abbreviations (set of strings)
    """
    parts = (part.strip(" ,") for part in surnameMarkerPattern.split(lastname))
    return {part for part in parts if part}


def fixSourceSeparators(source):
    """
    Correct the separators of a source reference ("Band und Blatt").
    The sources sometimes separate numbers with ";" instead of ",",
    e.g. "Bd. 34 Bl. 23-24; 29".
    :param source: source reference (string)
    :return: corrected source reference (string)
    """
    # remove semicolons at the end
    if source.endswith("; "):
        source = source[:-2]
    elif source.endswith(";"):
        source = source[:-1]
    # if there is no capital "B" two characters after a semicolon, make it a comma
    characters = list(source)
    for position, character in enumerate(characters):
        # slicing instead of indexing: a semicolon close to the end of the
        # string must not raise an IndexError
        if character == ";" and source[position + 2:position + 3] != "B":
            characters[position] = ","
    return "".join(characters)


def splitSources(source):
    """
    Split a source reference into its individual sources.
    :param source: source reference, sources separated by ";" (string)
    :return: non-empty sources without surrounding blanks (set of strings)
    """
    parts = (part.strip() for part in source.split(";"))
    return {part for part in parts if part}


def addNormFormColumns(row):
    """
    Add the columns of the norm form that the source does not fill.
    :param row: one index card (dictionary), changed in place
    """
    for key in relationIdColumns:
        row.setdefault(key, "")
    for key in emptyNormFormColumns:
        row[key] = ""


def deathYearFromOpening(row):
    """
    Derive the year of death from the opening ("Eröffnung") of the will.
    Assumption: the opening happens a few days after death and always in
    the year of death.
    :param row: one index card (dictionary)
    :return: last four digits of the first event that contains "Eröffnung",
        "" if there is no such event (string)
    """
    for event in eventColumns:
        if "Eröffnung" in row[event]:
            # delete everything except digits and take the last four of them
            return re.sub(r"\D", "", row[event])[-4:]
    return ""


def transferToNormForm(row):
    """
    Transfer the data of one index card to the columns of the norm form.
    Existing norm form columns are kept, only the ID is always rewritten.
    :param row: one index card (dictionary), changed in place
    """
    # "Z" is added to the ID in order to exclude identical IDs to the KLF
    row["id"] = "Z" + row["id"]

    for target, origin in [("firstnameGiven", "firstname"),
                           ("source", "Band und Blatt"),
                           ("occupation", "Stand/Beruf")]:
        if target not in row:
            row[target] = row[origin]
    if "sex" not in row:
        row["sex"] = {"männlich": "M", "weiblich": "F"}.get(row["Geschlecht"], "")
    if "deathday" not in row:
        row["deathday"] = row["Sterbedatum vor"]

    # if no date of death is known, adopt the year of the opening of the will
    # NOTE(review): this is applied to every index card, as the previous
    # (never executed) code was written - confirm that it is also wanted for
    # third persons.
    if row["deathday"] == "":
        deathYear = deathYearFromOpening(row)
        if deathYear:
            row["deathday"] = deathYear

    # surnames
    parsedLastname = surnameSeperator(row["lastname"], row["sex"])
    for key in ["surnameUnknown", "surnameMarriage1", "surnameMarriage2",
                "surnameMarriage3", "surnameGiven"]:
        row.setdefault(key, parsedLastname[key])


def fillIfEmpty(row, key, value):
    """Set row[key] to value, but only if the column is still empty."""
    if row[key] == "":
        row[key] = value


def linkRelatives(data):
    """
    Enter the IDs of spouses and parents that follow from the third persons.
    Has to run after all IDs received their "Z" prefix, otherwise reference
    persons that appear later in the list are not found.
    :param data: all index cards (list of dictionaries), changed in place
    """
    rowsById = {}
    for row in data:
        rowsById.setdefault(row["id"], []).append(row)

    for row in data:
        # the data fields related person ID and related person name are
        # interchanged in the source: "Name Bezugsperson" holds the ID
        if row["Rolle"] != thirdPersonRole or row["Name Bezugsperson"] == "":
            continue
        referenceId = "Z" + row["Name Bezugsperson"]
        referenceRows = rowsById.get(referenceId, [])
        relation = row["Art der Beziehung"]

        # husband/wife/fiancé (fiancé is assumed to be the same as husband)
        if relation in ("Ehemann", "Ehefrau", "Verlobter"):
            fillIfEmpty(row, "idSpouse1", referenceId)
            for referenceRow in referenceRows:
                fillIfEmpty(referenceRow, "idSpouse1", row["id"])
        # mother: this person is the mother of the reference person
        elif relation == "Mutter":
            for referenceRow in referenceRows:
                fillIfEmpty(referenceRow, "idMother", row["id"])
        # father: this person is the father of the reference person
        elif relation == "Vater":
            for referenceRow in referenceRows:
                fillIfEmpty(referenceRow, "idFather", row["id"])
        # son/daughter: the reference person is father or mother,
        # depending on the sex of the reference person
        elif relation in ("Sohn", "Tochter"):
            parentSexes = {referenceRow["Geschlecht"] for referenceRow in referenceRows}
            if parentSexes == {"männlich"}:
                fillIfEmpty(row, "idFather", referenceId)
            elif parentSexes == {"weiblich"}:
                fillIfEmpty(row, "idMother", referenceId)
            # otherwise it is not clear if it is father or mother
        # brother/sister, nephew/niece: no information about the parents known
        # nothing to do for "Sonstige" or an empty field


def findDuplicates(data):
    """
    Search the index cards that belong to the same person.
    Spouses have separate index cards and may appear twice: as third person
    and as testator. A third person is combined with the first testator that
    is not the reference person, shares a source with the reference person
    and shares a part of the surname with the third person.
    :param data: all index cards (list of dictionaries)
    :return: one entry per index card in the order of data: [id] for a single
        person, [id of the third person, id of the testator] for a duplicate
        (list of lists)
    """
    rowsById = {}
    for row in data:
        rowsById.setdefault(row["id"], row)
    # prepared once instead of for every comparison
    testators = [(row, splitSources(row["Band und Blatt"]), surnameComponents(row["lastname"]))
                 for row in data if row["Rolle"] == testatorRole]

    idList = []
    for rowNumber, row in enumerate(data):
        if rowNumber % 500 == 0:
            print("Status:", round(rowNumber * 100 / len(data), 2), "percent is finished")

        duplicate = None
        if row["Rolle"] == thirdPersonRole:
            # "Name Bezugsperson" holds the ID of the reference person
            referenceId = "Z" + row["Name Bezugsperson"]
            reference = rowsById.get(referenceId)
            if reference is not None:
                referenceSources = splitSources(reference["Band und Blatt"])
                ownNames = surnameComponents(row["lastname"])
                # assumption: there can be only one match
                for testator, sources, names in testators:
                    if (testator["id"] != referenceId
                            and sources & referenceSources
                            and names & ownNames):
                        duplicate = testator
                        break

        # persons without a match (also third persons whose reference person
        # is missing in the data) are kept as single persons
        if duplicate is None:
            idList.append([row["id"]])
        else:
            idList.append([row["id"], duplicate["id"]])
    return idList


def filterMergedIds(idList):
    """
    Remove the single entries of persons that are also part of a merge,
    otherwise these persons would be printed twice.
    :param idList: result of findDuplicates (list of lists)
    :return: filtered list in the same order (list of lists)
    """
    mergedIds = {personId for entry in idList if len(entry) != 1 for personId in entry}
    return [entry for entry in idList if len(entry) != 1 or entry[0] not in mergedIds]


def updateReferenceIds(data, idList):
    """
    Replace references to merged persons by the ID of the merged person.
    The new ID is "Z" plus the position of the merge in idList; if a person
    occurs in several merges, the first one is used.
    :param data: all index cards (list of dictionaries), changed in place
    :param idList: filtered list of IDs (list of lists)
    """
    # NOTE(review): a new ID may be identical to the ID of an index card that
    # is not merged - confirm that the ID ranges cannot overlap.
    newIds = {}
    for idNew, entry in enumerate(idList):
        if len(entry) == 2:  # only those with two entries
            for idOld in entry:
                newIds.setdefault(idOld, "Z" + str(idNew))
    for row in data:
        for idKey in idKeys:
            if row[idKey] in newIds:
                row[idKey] = newIds[row[idKey]]


def mergeEntries(firstEntry, secondEntry, newId):
    """
    Merge two index cards of the same person.
    :param firstEntry: first index card (dictionary)
    :param secondEntry: second index card with the same keys (dictionary)
    :param newId: ID of the merged person (string)
    :return: merged index card (dictionary)
    """
    # NOTE(review): "Rolle", "page" and "Stand/Beruf" are removed before the
    # export, while e.g. "occupation" and "source" keep only the first value -
    # confirm which columns should retain both values.
    matchedDict = {}
    for key in firstEntry:
        first = firstEntry[key]
        second = secondEntry[key]
        if key == "id":
            matchedDict[key] = newId
        elif first == second:
            matchedDict[key] = first
        # if only one contains information, take that one
        elif first == "":
            matchedDict[key] = second
        elif second == "":
            matchedDict[key] = first
        # when one is part of the other, only the longer writing
        elif str(first) in str(second):
            matchedDict[key] = second
        elif str(second) in str(first):
            matchedDict[key] = first
        # merge different information while retaining all information
        elif key in ["Rolle", "page", "Stand/Beruf"]:
            matchedDict[key] = first + ", " + second
        # both contain information: mainly spelling mistakes, abbreviations or
        # dates that are close to each other, so simply select one of those
        else:
            matchedDict[key] = first
    return matchedDict


def mergePersons(data, idList):
    """
    Build the final list of persons from the list of IDs.
    :param data: all index cards (list of dictionaries)
    :param idList: filtered list of IDs (list of lists)
    :return: single persons unchanged, duplicates merged, in the order of
        idList (list of dictionaries)
    """
    rowsById = {}
    for row in data:
        rowsById.setdefault(row["id"], row)
    newData = []
    for idNew, entry in enumerate(idList):
        if len(entry) == 1:
            newData.append(rowsById[entry[0]])
        else:  # several that need to be merged
            newData.append(mergeEntries(rowsById[entry[0]], rowsById[entry[1]], "Z" + str(idNew)))
    return newData


def dropObsoleteColumns(rows):
    """
    Delete the columns of the source that are no longer needed.
    :param rows: persons (list of dictionaries), changed in place
    """
    for row in rows:
        for key in obsoleteColumns:
            row.pop(key, None)


def loadData(path):
    """
    Load the index cards.
    :param path: path of the tab separated source file (string)
    :return: one dictionary per index card (list of dictionaries)
    """
    with open(path, "r", encoding=encoding, newline="") as file:
        return [dict(row) for row in csv.DictReader(file, delimiter=delimiter)]


def saveData(rows, path):
    """
    Save the standardized data; the columns are taken from the first person.
    :param rows: persons (list of dictionaries)
    :param path: path of the tab separated target file (string)
    """
    fieldnames = list(rows[0]) if rows else []
    with open(path, "w", newline="", encoding=encoding) as file:
        dataWriter = csv.DictWriter(file, fieldnames=fieldnames, delimiter=delimiter)
        dataWriter.writeheader()
        dataWriter.writerows(rows)


def main():
    """Run the complete conversion from the source file to the norm form."""
    data = loadData(inputFile)

    # correction of the sources
    for row in data:
        row["Band und Blatt"] = fixSourceSeparators(row["Band und Blatt"])

    # add empty columns
    for row in data:
        addNormFormColumns(row)

    # transferring the data to the standard form columns;
    # all IDs are rewritten before relatives are searched by ID
    for row in data:
        transferToNormForm(row)
    linkRelatives(data)

    # the source reference of the wills is not shown in the standard form,
    # so duplicates must already be combined here
    idList = filterMergedIds(findDuplicates(data))
    updateReferenceIds(data, idList)
    newData = mergePersons(data, idList)

    dropObsoleteColumns(newData)
    saveData(newData, outputFile)

    print("Status: Finished")


if __name__ == "__main__":
    main()