Skip to content
Snippets Groups Projects
Commit 715e1270 authored by Marcus Baumgarten's avatar Marcus Baumgarten
Browse files

Neue Datei hochladen

parent 413832e4
No related branches found
No related tags found
No related merge requests found
import csv
import json
import re
# Load the tab-separated source table into a list of row dictionaries.
encoding = "utf-8"
delimiter = "\t"
data = []
# newline="" leaves the line-ending handling to the csv module, as the csv
# documentation asks for; without it, line breaks inside quoted fields can be
# misread. The output file further down is opened the same way.
with open("leipzig_testamente-gesamt.csv", "r", newline="", encoding=encoding) as file:
    for i in csv.DictReader(file, delimiter=delimiter):
        # the JSON round trip turns each reader row into a plain dict
        i = json.loads(json.dumps(i))
        data.append(i)
def surnameSeperator(lastname, sex):
    """
    Break a recorded last name into its components.

    The first marker found (checked in this order) decides how the string is
    read; only one marker is evaluated per name:

    * "geb." / "Geb."         - the text after the marker is the birth name
    * "verw."                 - the text after the marker is a married name
    * "verehel." / "Verehe."  - the text before the marker is the birth name,
      the text after it is a married name

    Without a marker the whole string is the birth name if sex is "M" and is
    stored as "surnameUnknown" for every other value of sex.

    :param lastname: surname as recorded in the source (string)
    :param sex: sex code; only "M" is treated specially (string)
    :return: dictionary with the keys "surnameMarriage1", "surnameMarriage2",
        "surnameMarriage3", "surnameGiven" and "surnameUnknown"; components
        that were not recognised are empty strings. "surnameMarriage2" and
        "surnameMarriage3" are never filled by this function.
    """
    surnameDict = {
        "surnameMarriage1": "",
        "surnameMarriage2": "",
        "surnameMarriage3": "",
        "surnameGiven": "",
        "surnameUnknown": "",
    }

    def textAfter(marker):
        # strip() instead of skipping exactly one character: a missing blank
        # after the marker must not cost the first letter of the name
        return lastname[lastname.find(marker) + len(marker):].strip()

    def textBefore(marker):
        # drop the separator (blank and/or comma) in front of the marker
        return lastname[:lastname.find(marker)].rstrip(" ,")

    # recognize the birth name
    if "geb." in lastname:
        surnameDict["surnameGiven"] = textAfter("geb.")
    elif "Geb." in lastname:
        surnameDict["surnameGiven"] = textAfter("Geb.")
    # recognize the married name
    elif "verw." in lastname:
        surnameDict["surnameMarriage1"] = textAfter("verw.")
    elif "verehel." in lastname:
        surnameDict["surnameMarriage1"] = textAfter("verehel.")
        surnameDict["surnameGiven"] = textBefore("verehel.")
    elif "Verehe." in lastname:
        # same layout as "verehel."; the married name used to be written to an
        # unused local variable and was therefore missing from the result
        surnameDict["surnameMarriage1"] = textAfter("Verehe.")
        surnameDict["surnameGiven"] = textBefore("Verehe.")
    # for men the surname is always the birth name
    elif sex == "M":
        surnameDict["surnameGiven"] = lastname
    # for everyone else it is not clear which kind of name this is
    else:
        surnameDict["surnameUnknown"] = lastname
    return surnameDict
# correction of the sources
# sometimes separates numbers with ";" instead of ",", e.g. Bd. 34 Bl. 23-24; 29
for i in data:
    reference = i["Band und Blatt"]
    # remove one semicolon (optionally followed by a blank) at the end
    if reference.endswith(";"):
        reference = reference[:-1]
    elif reference.endswith("; "):
        reference = reference[:-2]
    # a ";" is kept only if the character two positions later is a capital "B";
    # every other ";" becomes a comma
    characters = list(reference)
    for n2, character in enumerate(characters):
        # slice instead of index: a ";" within the last two characters has no
        # character at n2 + 2, which used to raise an IndexError; the empty
        # slice compares unequal to "B", so such a ";" becomes a comma as well
        if character == ";" and reference[n2 + 2:n2 + 3] != "B":
            characters[n2] = ","
    i["Band und Blatt"] = "".join(characters)
# add empty columns
# reference columns: only created where a row does not have them yet
keysAppend = ["idSpouse1",
              "idSpouse2",
              "idSpouse3",
              "idFather",
              "idMother"]
# variables that are irrelevant here but occur in the norm form;
# these are set to "" on every row, whether they existed before or not
addVarList = ["firstnameChange",
              "surnameChange",
              "birthday",
              "birthplace",
              "birthplaceGOV",
              "growthUpPlace",
              "growthUpPlaceGOV",
              "baptismday",
              "baptismplace",
              "baptismplaceGOV",
              "marriageday1",
              "marriageday2",
              "marriageday3",
              "marriageplace1",
              "marriageplace2",
              "marriageplace3",
              "marriageplaceGOV1",
              "marriageplaceGOV2",
              "marriageplaceGOV3",
              "ageAtMarriage1",
              "ageAtMarriage2",
              "ageAtMarriage3",
              "divorceday1",
              "divorceday2",
              "divorceday3",
              "deathplace",
              "deathplaceGOV",
              "causeOfDeath",
              "martialStatusAtDeath",
              "ageAtDeath",
              "burialday",
              "burialplace",
              "burialplaceGOV"]
for i in data:
    for key in keysAppend:
        if key not in i:
            i[key] = ""
    for addVar in addVarList:
        i[addVar] = ""
# transferring the data to the standard form columns
# For every row: the id gets a "Z" prefix, the norm-form columns
# (firstnameGiven, source, occupation, sex, deathday, surname parts) are filled
# from the source columns when they are not present yet, and for rows whose
# "Rolle" is "Drittperson/Verwandter" family reference IDs are derived.
# NOTE(review): the statements below carry no indentation in this copy of the
# file; the loop nesting has to be inferred from the variables they use.
for rowNumber, i in enumerate(data):
# "Z" is added to the ID in order to exclude identical IDs to the KLF
# variable "id" already exists in the KLK, which is why it is always updated
# NOTE(review): the prefix is added row by row in this loop, while the searches
# further down compare "Z" + ... against j["id"] of every row; rows that come
# later in data presumably still carry their unprefixed id at that moment and
# cannot match - confirm.
i.update({"id": "Z" + i["id"]})
# each try/except below only fills a column that does not exist yet
try:
i["firstnameGiven"]
except KeyError:
i.update({"firstnameGiven": i["firstname"]})
try:
i["source"]
except KeyError:
i.update({"source": i["Band und Blatt"]})
try:
i["occupation"]
except KeyError:
i.update({"occupation": i["Stand/Beruf"]})
# "Geschlecht" is mapped to "M" / "F"; any other value gives an empty string
try:
i["sex"]
except KeyError:
if i["Geschlecht"] == "männlich":
i.update({"sex": "M"})
elif i["Geschlecht"] == "weiblich":
i.update({"sex": "F"})
else:
i.update({"sex": ""})
# after this step every row has a "deathday" key
try:
i["deathday"]
except KeyError:
i.update({"deathday": i["Sterbedatum vor"]})
# surnames
# the parsed components are only stored where the column is still missing
parsedLastname = surnameSeperator(i["lastname"], i["sex"])
parsedLastnameKeys = ["surnameUnknown", "surnameMarriage1", "surnameMarriage2", "surnameMarriage3", "surnameGiven"]
for key in parsedLastnameKeys:
try:
i[key]
except KeyError:
i.update({key: parsedLastname[key]})
# with third person/relative
if i["Rolle"] == "Drittperson/Verwandter":
# save iD of the reference person
# data fields related person ID and related person name are interchanged
# NOTE(review): idRelative is not read again in this section; the same
# expression is rebuilt wherever it is needed below.
idRelative = "Z" + i["Name Bezugsperson"]
# type of relationship
# husband/wife/fiancé (fiancé is assumed to be the same as husband)
# "fiancée" does not occur
if i["Art der Beziehung"] == "Ehemann" or i["Art der Beziehung"] == "Ehefrau" or i[
"Art der Beziehung"] == "Verlobter":
# for a spouse, add ID of the spouse
if i["idSpouse1"] == "":
i.update({"idSpouse1": "Z" + i["Name Bezugsperson"]})
# search spouse
# the ID must also be supplemented
for n2, j in enumerate(data):
if "Z" + i["Name Bezugsperson"] == j["id"]:
# complete ID of the spouse
if j["idSpouse1"] == "":
data[n2].update({"idSpouse1": i["id"]})
# brother/sister
elif i["Art der Beziehung"] == "Bruder" or i["Art der Beziehung"] == "Schwester":
# no information about the parents known
# NOTE(review): "continue" also skips the event handling further down for this
# row, assuming that code belongs to the same loop body; the same holds for the
# other "continue" statements in this chain.
continue
# nephew/niece
elif i["Art der Beziehung"] == "Neffe" or i["Art der Beziehung"] == "Nichte":
# no information about the parents known
continue
# mother
elif i["Art der Beziehung"] == "Mutter":
# add the ID of the mother for the child
for n2, j in enumerate(data):
if "Z" + i["Name Bezugsperson"] == j["id"]:
# add mother
# NOTE(review): the value written is built from the matched row's own
# "Name Bezugsperson", not from i["id"] - confirm that this is the intended
# parent reference.
if data[n2]["idMother"] == "" and data[n2]["Name Bezugsperson"] != "":
data[n2].update({"idMother": "Z" + data[n2]["Name Bezugsperson"]})
# father
elif i["Art der Beziehung"] == "Vater":
# add the ID of the father for the child
for n2, j in enumerate(data):
if "Z" + i["Name Bezugsperson"] == j["id"]:
# add father
# NOTE(review): same pattern as for the mother - the matched row's own
# "Name Bezugsperson" is written, not i["id"].
if data[n2]["idFather"] == "" and data[n2]["Name Bezugsperson"] != "":
data[n2].update({"idFather": "Z" + data[n2]["Name Bezugsperson"]})
# son/daughter
elif i["Art der Beziehung"] == "Sohn" or i["Art der Beziehung"] == "Tochter":
# NOTE(review): n2 is not set anywhere in this branch; it holds whatever an
# earlier search loop left behind (and is undefined if none has run yet).
# Presumably a lookup of the reference person is missing here.
# add father
if i["Geschlecht"] == "männlich":
if data[n2]["idFather"] == "" and data[n2]["Name Bezugsperson"] != "":
data[n2].update({"idFather": "Z" + data[n2]["Name Bezugsperson"]})
# add mother
# NOTE(review): "weilich" differs from the spelling "weiblich" tested in the
# sex mapping above, so this comparison presumably never succeeds.
elif i["Geschlecht"] == "weilich":
if data[n2]["idMother"] == "" and data[n2]["Name Bezugsperson"] != "":
data[n2].update({"idMother": "Z" + data[n2]["Name Bezugsperson"]})
# not clear if it is father or mother
else:
continue
# do nothing for "Sonstige" or an empty field
else:
continue
# If "Eröffnung" in an event, then adopt the year as the year of death
eventList = ["Ereignis 1",
"Ereignis 2",
"Ereignis 3",
"Ereignis 4",
"Ereignis 5",
"Ereignis 6",
"Ereignis 7",
"Ereignis 8"]
for event in eventList:
if "Eröffnung" in i[event]:
# opening happens a few days after death
# assumption here: It always happens in the year of death
# select year from string
# delete everything except numbers and take the last four of them
# NOTE(review): the "deathday" key was already ensured earlier in this loop
# body, so the KeyError branch below looks unreachable; a test for an empty
# value may have been intended - confirm.
# NOTE(review): the pattern "\zD" does not express "everything except numbers"
# as described above; r"\D" is presumably meant.
try: # overwrite only if it does not already exist
i["deathday"]
except KeyError:
i.update({"deathday": re.sub("\zD", "", i[event])[-4:]})
break # runtime improvement
# spouses have separate index cards and may appear twice as reference persons
# key here is the source reference of the wills
# this is not shown in the standard form, so it must already be combined here
# in new loop to continue with completely corrected values
# result is a list containing the IDs to be merged: one-element lists for
# persons that stay as they are, two-element lists for duplicates to be merged
def _sourceParts(entry):
    """Return the non-empty parts of an entry's ";"-separated source reference.

    The parts are trimmed: the reference is written with a blank after each
    ";", so without trimming only the first part could ever equal a part of
    another entry. Empty parts (an incorrect entry in the data set) are
    dropped, because they would match every other empty part.
    """
    return {part.strip() for part in entry["Band und Blatt"].split(";")} - {""}


def _shareSurnamePart(surnames1, surnames2):
    """Tell whether two surnameSeperator() results share a non-empty component.

    Empty components are ignored: every result contains empty strings, so
    comparing them as well would make any two persons look related.
    """
    parts1 = {name for name in surnames1.values() if name}
    return any(name in parts1 for name in surnames2.values() if name)


idList = []
for rowNumber, i in enumerate(data):
    if rowNumber % 500 == 0:
        print("Status:", round(rowNumber * 100 / len(data), 2), "percent is finished")
    # persons that are not third parties are taken over unchanged,
    # otherwise only the third persons would appear in the result
    if i["Rolle"] != "Drittperson/Verwandter":
        idList.append([i["id"]])
        continue
    # there is an error in the DES database in the assignment of the data
    # field, actually this would be "Bezugsperson ID"
    idRelative = "Z" + i["Name Bezugsperson"]
    # the surname of the third person does not change during the search
    surnames1 = surnameSeperator(i["lastname"], i["sex"])
    referenceFound = False
    # search for the reference person with that ID
    for j in data:
        if j["id"] != idRelative:
            continue
        referenceFound = True
        # the source is taken from the reference person
        sources = _sourceParts(j)
        # look for a testator (other than the reference person) that shares a
        # source part and a surname component with the third person; a person
        # can have two third persons, and the name of the third person is
        # often less extensive, hence the comparison by components
        # assumption: there can be only one match, so the first one is taken
        duplicateId = None
        for y in data:
            if y["id"] == idRelative or y["Rolle"] != "Erblasser":
                continue
            if sources.isdisjoint(_sourceParts(y)):
                continue
            if _shareSurnamePart(surnames1, surnameSeperator(y["lastname"], y["sex"])):
                duplicateId = y["id"]
                break
        if duplicateId is None:
            # nothing found, then it is only one person
            idList.append([i["id"]])
        else:
            idList.append([i["id"], duplicateId])
    # a third person whose reference person does not exist in the data used to
    # vanish from the result entirely; keep it as a single person instead
    if not referenceFound:
        idList.append([i["id"]])
# some IDs that are part of a merge group also occur on their own in idList,
# which would output those persons twice; the stand-alone entries are removed
# collect the IDs of all persons that belong to a merge group
idListMergedPerson = []
for group in idList:
    if len(group) != 1:
        idListMergedPerson.extend(group)
# keep every merge group, and a single person only if it is not merged anywhere
newIdList = [group for group in idList
             if len(group) != 1 or group[0] not in idListMergedPerson]
idList = newIdList
# update ID of the reference person
# columns that hold the ID of another person
# (can be extended if there are more ID columns)
idKeys = ("Name Bezugsperson", "idFather", "idMother", "idSpouse1", "idSpouse2",
          "idSpouse3")
for i in data:
    for idNew, double in enumerate(idList):
        # only groups of exactly two entries that contain this person count
        if len(double) != 2 or i["id"] not in double:
            continue
        idOld = i["id"]
        # the merged person is addressed by the position of its group in idList
        idMerged = "Z" + str(idNew)
        # every reference to the old ID is overwritten in the original list
        for idKey in idKeys:
            for entry in data:
                if entry[idKey] == idOld:
                    entry[idKey] = idMerged
# merge idList
def _mergeEntries(firstEntry, secondEntry, newId):
    """Combine two rows that describe the same person into one new row.

    :param firstEntry: first row (dictionary); its keys define the result
    :param secondEntry: second row (dictionary) with the same keys
    :param newId: value for the "id" column of the merged row (string)
    :return: merged row (dictionary)
    """
    matchedDict = {}
    for key in firstEntry:
        if key == "id":  # rewrite new ID
            matchedDict[key] = newId
            continue
        first = firstEntry[key]
        second = secondEntry[key]
        if first == second:
            matchedDict[key] = first
        # if only one contains information, take that one
        # (the values differ here, so the other one cannot be empty as well)
        elif first == "":
            matchedDict[key] = second
        elif second == "":
            matchedDict[key] = first
        # when one is part of the other, keep only the longer writing
        elif str(first) in str(second):
            matchedDict[key] = second
        elif str(second) in str(first):
            matchedDict[key] = first
        # merge different information while retaining all of it; the second
        # value used to be replaced by a repetition of the first one
        elif key in ("Rolle", "page", "Stand/Beruf"):
            matchedDict[key] = first + ", " + second
        # both contain information: mainly spelling mistakes, abbreviations
        # or dates that are close to each other, so simply select the first
        else:
            matchedDict[key] = first
    return matchedDict


# first row for every ID, so that a lookup does not scan the whole table
entryById = {}
for entry in data:
    entryById.setdefault(entry["id"], entry)

newData = []
for idNew, double in enumerate(idList):
    if len(double) == 1:
        # single person: take the row over as it is (skipped if the ID is unknown)
        if double[0] in entryById:
            newData.append(entryById[double[0]])
    else:  # several that need to be merged
        newData.append(
            _mergeEntries(entryById[double[0]], entryById[double[1]], "Z" + str(idNew)))
# delete keys that are no longer needed
# source columns that are not part of the norm form
keysDelete = ("page",
              "lastname",
              "firstname",
              "Stand/Beruf",
              "Rolle",
              "Ort",
              "Band und Blatt",
              "Familienstand",
              "Ereignis 1",
              "Ereignis 2",
              "Ereignis 3",
              "Ereignis 4",
              "Ereignis 5",
              "Ereignis 6",
              "Ereignis 7",
              "Ereignis 8",
              "Geschlecht",
              "Bezugsperson ID",
              "Name Bezugsperson",
              "Art der Beziehung",
              "Sterbedatum vor",
              "Datum von",
              "Datum bis",
              "Bemerkung")
for i in newData:
    for key in keysDelete:
        # columns that a row does not have are simply skipped
        if key in i:
            del i[key]
# save standardized data
# the columns of the first row define the header of the output table
fieldnames = newData[0].keys()
# the with block closes the file even if writing a row fails
with open("leipzig_testamente_normform.csv", "w", newline="", encoding=encoding) as writer:
    dataWriter = csv.DictWriter(writer, fieldnames=fieldnames, delimiter=delimiter)
    dataWriter.writeheader()
    dataWriter.writerows(newData)
print("Status: Finished")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment