# column layout of tableResult.csv (standard form plus the linkage columns)
_RESULT_COLUMNS = ["idGlobal",
                   "idSource1",
                   "idSource2",
                   "firstnameGiven",
                   "firstnameChange",
                   "sex",
                   "surnameGiven",
                   "surnameChange",
                   "surnameMarriage1",
                   "surnameMarriage2",
                   "surnameMarriage3",
                   "surnameUnknown",
                   "birthday",
                   "birthplace",
                   "birthplaceGOV",
                   "growthUpPlace",
                   "growthUpPlaceGOV",
                   "baptismday",
                   "baptismplace",
                   "baptismplaceGOV",
                   "marriageday1",
                   "marriageplace1",
                   "marriageplaceGOV1",
                   "ageAtMarriage1",
                   "idSpouse1",
                   "divorceday1",
                   "marriageday2",
                   "marriageplace2",
                   "marriageplaceGOV2",
                   "ageAtMarriage2",
                   "idSpouse2",
                   "divorceday2",
                   "marriageday3",
                   "marriageplace3",
                   "marriageplaceGOV3",
                   "ageAtMarriage3",
                   "idSpouse3",
                   "divorceday3",
                   "deathday",
                   "deathplace",
                   "deathplaceGOV",
                   "causeOfDeath",
                   "martialStatusAtDeath",
                   "ageAtDeath",
                   "burialday",
                   "burialplace",
                   "burialplaceGOV",
                   "occupation",
                   "idFather",
                   "idMother",
                   "similarity",
                   "source"]

# columns that are not merged "dominantly" (they are generated, enhanced or de-duplicated instead)
_NON_DOMINANT_COLUMNS = {"idGlobal", "idSource1", "idSource2", "occupation", "similarity", "source"}

# columns for which one of the two values is selected when two records are merged
_DOMINANT_COLUMNS = [column for column in _RESULT_COLUMNS if column not in _NON_DOMINANT_COLUMNS]

# columns where both values (from both records) remain
_ENHANCE_COLUMNS = ["id", "source"]

# columns that hold IDs of reference persons
_RELATION_COLUMNS = ["idSpouse1", "idSpouse2", "idSpouse3", "idFather", "idMother"]

# columns that are converted to a readable date format before the output
# NOTE(review): the divorce days are not converted here, exactly as before - confirm whether that is intended
_DATE_COLUMNS = ["birthday", "baptismday", "marriageday1", "marriageday2", "marriageday3",
                 "deathday", "burialday"]

# column layout of idDictList.csv
_ID_KEY_COLUMNS = ["idGlobal",
                   "idSource1",
                   "idSource2",
                   "idFatherSource1",
                   "idFatherSource2",
                   "idMotherSource1",
                   "idMotherSource2",
                   "idSpouse1Source1",
                   "idSpouse1Source2",
                   "idSpouse2Source1",
                   "idSpouse2Source2",
                   "idSpouse3Source1",
                   "idSpouse3Source2",
                   "similarity",
                   "priority"]


def main():
    """
    This function executes the programme.
    It reads both tables in the standard form, links the records of the first table with the
    records of the second table and writes the ID keys ("idDictList.csv") as well as the merged
    table ("tableResult.csv").
    """
    # if the variable is 1, the data tables are sorted by the last name (surnameGiven)
    # furthermore only persons are compared, whose first letter of the surname is the same
    # this has advantages at runtime, but then the same persons are no longer recognized, whose
    # first letter of the surname is no longer the same (e.g. "Pauer" and "Bauer")
    # currently only works if exactly the same tables are compared
    sortingBySurnameGiven = 1

    # start profiler
    pr = cProfile.Profile()
    pr.enable()

    # start timer
    startTime = time.time()

    # loading the records (input data in the standard form)
    entries1 = fileReader("normform1.csv")
    entries2 = fileReader("normform2.csv")

    # sorting by last name, if desired
    if sortingBySurnameGiven == 1:
        # first letters must be capitalized, because otherwise capital letters are sorted first
        for entries in (entries1, entries2):
            for element in entries:
                element["surnameGiven"] = element["surnameGiven"].capitalize()
            entries.sort(key=operator.itemgetter("surnameGiven"))

    letterDict, letterDictBegin = _buildLetterIndex(entries2)

    # check whether the tables to be compared are identical
    # this is done once, before any record is changed for the output
    if entries1 == entries2:
        print("Status: Both tables to be compared are identical")
        sameTables = 1
    else:
        print("Status: The tables to be compared are different tables")
        sameTables = 0

    idDictList, idDictDict = _linkRecords(entries1, entries2, sameTables, sortingBySurnameGiven,
                                          letterDict, letterDictBegin)
    _writeIdKeys(idDictList)
    _writeResult(idDictList, idDictDict, entries1, entries2, sameTables)

    print("Status: Program finished")
    # stop timer
    endTime = time.time()
    print("Status: Processing time, {:5.3f}s".format(endTime - startTime))

    # stop profiler
    pr.disable()
    pr.print_stats(sort="calls")


def _printProgress(message, index, total):
    """
    This function prints a progress message with percentage and time stamp.
    :param message: beginning of the status message (string)
    :param index: number of the current record (integer)
    :param total: number of all records, must not be 0 (integer)
    """
    print(message + "{0:3.0f}%".format((100 * index) / total), time.strftime("%d.%m.%Y %H:%M:%S"))


def _indexById(entries):
    """
    This function creates a lookup from the ID to the record, so that records do not have to be
    searched line by line. If an ID occurs several times, the first record wins.
    :param entries: data from a source (list of dictionaries)
    :return: records by ID (dictionary)
    """
    entriesById = {}
    for entry in entries:
        entriesById.setdefault(entry["id"], entry)
    return entriesById


def _buildLetterIndex(entries2):
    """
    This function determines the search space of each first letter of the surname in the
    (sorted) second table.
    :param entries2: data of the second source (list of dictionaries)
    :return: dictionary with the end line per letter, dictionary with the first line per letter
    """
    letterDict = {}
    # there is no change to the first letter, so it must be added separately
    letterDictBegin = {"": 0}
    oldLetter = ""
    for index, person in enumerate(entries2):
        newLetter = person["surnameGiven"][:1].lower()
        if newLetter != oldLetter:
            # the block of the old letter ends where the block of the new letter begins
            letterDict[oldLetter] = index  # lower case letters
            letterDict[oldLetter.upper()] = index  # upper case letters
            letterDictBegin[newLetter] = index
            letterDictBegin[newLetter.upper()] = index
            oldLetter = newLetter
    # the last letter is no longer changed, so it must be added separately
    # (only if there are records at all, an empty table has no last letter)
    if entries2:
        # NOTE(review): the value is used as exclusive end of a slice, so the very last record
        # of the table is never part of the search space - kept as it was, because namecheck()
        # receives this dictionary as well; confirm whether len(entries2) is meant
        lastIndex = len(entries2) - 1
        letterDict[oldLetter] = lastIndex
        letterDict[oldLetter.upper()] = lastIndex
    return letterDict, letterDictBegin


def _makeIdDict(idGlobal, first, second, similarity, priority):
    """
    This function creates the ID dictionary that stores the links between the individual IDs.
    :param idGlobal: global ID of the person (integer)
    :param first: record of the first table or None
    :param second: record of the second table or None
    :param similarity: similarity value of both records
    :param priority: priority value of both records
    :return: ID dictionary
    """
    idDict = {"idGlobal": idGlobal}
    for suffix, record in (("Source1", first), ("Source2", second)):
        idDict["id" + suffix] = "" if record is None else record["id"]
        for relation in ("idFather", "idMother", "idSpouse1", "idSpouse2", "idSpouse3"):
            idDict[relation + suffix] = "" if record is None else record[relation]
    idDict["similarity"] = similarity
    idDict["priority"] = priority
    return idDict


def _registerGlobalId(idDictDict, idDict):
    """
    This function permanently assigns the local IDs of an ID dictionary to its global ID.
    :param idDictDict: assignment of local IDs to global IDs (dictionary, is extended)
    :param idDict: ID dictionary
    """
    for key in ("idSource1", "idSource2"):
        if idDict[key] != "":
            idDictDict[idDict[key]] = idDict["idGlobal"]


def _linkRecords(entries1, entries2, sameTables, sortingBySurnameGiven, letterDict, letterDictBegin):
    """
    This function searches for each record of the first table the best matching record of the
    second table and assigns a global ID to every person.
    :param entries1: data of the first source (list of dictionaries)
    :param entries2: data of the second source (list of dictionaries)
    :param sameTables: 1 if both tables are identical, otherwise 0
    :param sortingBySurnameGiven: 1 if only the same initial letters are compared
    :param letterDict: end line per letter in the second table (dictionary)
    :param letterDictBegin: first line per letter in the second table (dictionary)
    :return: list of lists with one ID dictionary each, assignment of local to global IDs
    """
    entries1ById = _indexById(entries1)
    idGlobal = 0
    idDictList = []
    idDictDict = {}
    # IDs of both tables that were already used (only membership is needed, therefore a set)
    usedIds = set()

    # iterate first list (outer loop)
    for index, row in enumerate(entries1):
        if index % 100 == 0:
            _printProgress("Status: Progress of the editing of the first list, ", index, len(entries1))

        # if two equal lists are compared, the entries should not be compared twice
        # here a limitation of the scope of the second list is necessary
        if sameTables != 1:
            entriesCache = entries2
        elif sortingBySurnameGiven == 1:
            # check only the same initial letters to save computing time
            entriesCache = entries2[index:letterDict[row["surnameGiven"][:1]]]
        else:
            entriesCache = entries2[index:]

        # for namecheck() the records of the reference persons must be determined
        # (empty dictionary if the person is unknown)
        fathers = entries1ById.get(row["idFather"], {})
        mothers = entries1ById.get(row["idMother"], {})
        # the spouses are searched by their own IDs (before, the ID of the mother was used)
        spouse1 = entries1ById.get(row["idSpouse1"], {})
        spouse2 = entries1ById.get(row["idSpouse2"], {})
        spouse3 = entries1ById.get(row["idSpouse3"], {})

        # iterate second list (inner loop) and collect all hits
        candidates = []
        for innerrow in entriesCache:
            # do not compare records with the same IDs
            if row["id"] == innerrow["id"]:
                continue
            similarity = namecheck(row, innerrow, fathers, mothers, spouse1, spouse2, spouse3, letterDictBegin,
                                   letterDict, entries2)
            # proceed only if similarity is not 0 and not 999
            if similarity == 0 or similarity == 999:
                continue
            priority = prioritycalc(row, innerrow)
            # priority is necessary and the IDs must not have been used before
            if priority != 1 or row["id"] in usedIds or innerrow["id"] in usedIds:
                continue
            # count up globalId for every hit (not only for the best one), so that the
            # numbering of the global IDs stays as it was
            idGlobal = idGlobal + 1
            candidates.append(_makeIdDict(idGlobal, row, innerrow, similarity, priority))

        if candidates:
            # for one record of the first list several hits may have been found: select best
            # (with equal similarity the first hit wins)
            best = candidates[0]
            bestMatch = 0
            for candidate in candidates:
                if candidate["similarity"] > bestMatch:
                    bestMatch = candidate["similarity"]
                    best = candidate
            # prevent the IDs from being used again
            usedIds.add(best["idSource2"])
            usedIds.add(best["idSource1"])
            idDictList.append([best])
            _registerGlobalId(idDictDict, best)
        elif row["id"] not in usedIds:
            # if there is no match, then use the record of the first list
            idGlobal = idGlobal + 1
            usedIds.add(row["id"])
            idDict = _makeIdDict(idGlobal, row, None, 999, 999)
            idDictList.append([idDict])
            _registerGlobalId(idDictDict, idDict)

    # handle all records of the second list (entries2) that have not been merged yet
    # that should never happen when there are two same lists (loop runs through without action)
    for index, innerrow in enumerate(entries2):
        if index % 100 == 0:
            _printProgress("Status: Progress of the editing of the second list, ", index, len(entries2))
        # continue only if no hit has been scored yet
        if innerrow["id"] in usedIds:
            continue
        idGlobal = idGlobal + 1
        usedIds.add(innerrow["id"])
        # these records never had a priority, the column stays empty
        idDict = _makeIdDict(idGlobal, None, innerrow, 999, "")
        idDictList.append([idDict])
        _registerGlobalId(idDictDict, idDict)

    return idDictList, idDictDict


def _writeIdKeys(idDictList):
    """
    This function saves the ID keys to "idDictList.csv".
    The header is fixed, so that an empty list only produces the header instead of an error.
    :param idDictList: list of lists with one ID dictionary each
    """
    with open("idDictList.csv", "w", newline="", encoding="utf-8-sig") as f3:
        filewriter = csv.DictWriter(f3, fieldnames=_ID_KEY_COLUMNS, delimiter="\t")
        filewriter.writeheader()
        for innerIdDictList in idDictList:
            filewriter.writerows(innerIdDictList)


def _globalLinks(record, idDictDict, sourceColumn):
    """
    This function determines the global IDs that replace the local IDs of a record.
    :param record: record of one of the tables (dictionary)
    :param idDictDict: assignment of local IDs to global IDs (dictionary)
    :param sourceColumn: column in which the local ID is kept ("idSource1" or "idSource2")
    :return: columns to be updated (dictionary)
    """
    updates = {"idGlobal": searchIdDict2(idDictDict, record["id"]),
               sourceColumn: record["id"]}
    for key in _RELATION_COLUMNS:
        updates[key] = searchIdDict2(idDictDict, record[key])
    return updates


def _writeSingleRecord(filewriter, record, updates, printedIds):
    """
    This function writes a record that is not merged with another record.
    The record itself is updated, because it may be needed again with the replaced IDs.
    :param filewriter: writer of the result table (csv.DictWriter)
    :param record: record of one of the tables (dictionary, is changed)
    :param updates: columns to be updated, e.g. the global IDs (dictionary)
    :param printedIds: IDs that have already been output (set, is extended)
    """
    # first names can be stored in a list or not
    if isinstance(record["firstnameGiven"], list):
        record["firstnameGiven"] = " ".join(record["firstnameGiven"])
    record.update(updates)
    # avoiding the output of duplicate IDs
    if record["id"] in printedIds:
        return
    # the ID is removed only for printing, as it is otherwise still needed
    printable = copy.deepcopy(record)
    printable.pop("id", None)
    printable["occupation"] = listToString(printable["occupation"])
    # convert dates to readable format
    for key in _DATE_COLUMNS:
        printable[key] = datetimeToString(printable[key])
    filewriter.writerow(printable)
    printedIds.add(record["id"])


def _isBlank(value):
    """
    This function checks whether a value contains no information.
    :param value: value of a column
    :return: True if the value is None, an empty string or an empty list
    """
    return value is None or value == "" or value == []


def _pickDominant(first, second):
    """
    This function selects the more meaningful of two values of the same column.
    For strings and lists this is the longer information, for numbers the larger one.
    :param first: value of the record of the first table
    :param second: value of the record of the second table
    :return: selected value (lists are joined to a string)
    """
    if isinstance(first, int) and isinstance(second, int):
        # numbers cannot be joined, the larger one is taken as it is
        return first if first >= second else second
    if isinstance(first, list) and isinstance(second, list):
        firstJoined = " ".join(first)
        secondJoined = " ".join(second)
        return firstJoined if len(firstJoined) >= len(secondJoined) else secondJoined
    if isinstance(first, str) and isinstance(second, str):
        return first if len(first) >= len(second) else second
    # no identical data types: select the first record, unless it contains no information
    # (e.g. an empty string in the first record and a converted date in the second one)
    chosen = second if _isBlank(first) and not _isBlank(second) else first
    return " ".join(chosen) if isinstance(chosen, list) else chosen


def _joinBoth(first, second):
    """
    This function keeps the information of both records, separated by commas.
    :param first: value of the record of the first table (string or list)
    :param second: value of the record of the second table (string or list)
    :return: merged information (string)
    """
    if isinstance(first, list):
        if second != "":  # only separate if there is also something in there
            return ", ".join(first) + ", " + ", ".join(second)
        return ", ".join(first)
    return first + ", " + second


def _joinDistinct(first, second):
    """
    This function merges the information of both records and deletes redundant information.
    The order of the first occurrence is kept, so that the output is the same in every run.
    :param first: value of the record of the first table (string or list)
    :param second: value of the record of the second table (string or list)
    :return: merged information (string)
    """
    # if information is not in list, split information by separator
    firstList = first if isinstance(first, list) else first.split(",")
    secondList = second if isinstance(second, list) else second.split(",")
    return listToString(list(dict.fromkeys(firstList + secondList)))


def _mergeRecords(row, innerrow, idDictList, idDictDict):
    """
    This function merges two linked records into one line of the result table.
    :param row: record of the first table (dictionary)
    :param innerrow: record of the second table (dictionary)
    :param idDictList: list of lists with one ID dictionary each
    :param idDictDict: assignment of local IDs to global IDs (dictionary)
    :return: line of the result table (dictionary)
    """
    mergingDict = {}
    for key in _DOMINANT_COLUMNS:
        mergingDict[key] = _pickDominant(row[key], innerrow[key])
    for key in _ENHANCE_COLUMNS:
        mergingDict[key] = _joinBoth(row[key], innerrow[key])
    mergingDict["occupation"] = _joinDistinct(row["occupation"], innerrow["occupation"])

    # presumably searchIdDict2() is a pure lookup, so its result is reused - TODO confirm
    searchingForId = searchIdDict2(idDictDict, mergingDict["id"])
    # search for the similarity value that belongs to the global ID (999 if there is none)
    relevantSimilarityValue = 999
    for idDictGroup in idDictList:
        hit = next((d for d in idDictGroup if d["idGlobal"] == searchingForId), None)
        if hit is not None:
            relevantSimilarityValue = hit["similarity"]
            break

    # convert dates to readable format
    for key in _DATE_COLUMNS:
        mergingDict[key] = datetimeToString(mergingDict[key])

    dictparams = {key: mergingDict[key] for key in _DOMINANT_COLUMNS}
    # IDs of reference persons must be replaced by global IDs
    for key in _RELATION_COLUMNS:
        dictparams[key] = searchIdDict2(idDictDict, mergingDict[key])
    dictparams["idGlobal"] = searchingForId
    dictparams["idSource1"] = row["id"]
    dictparams["idSource2"] = innerrow["id"]
    dictparams["occupation"] = mergingDict["occupation"]
    dictparams["similarity"] = relevantSimilarityValue
    dictparams["source"] = mergingDict["source"]
    return dictparams


def _writeResult(idDictList, idDictDict, entries1, entries2, sameTables):
    """
    This function merges the linked records and saves the result to "tableResult.csv".
    :param idDictList: list of lists with one ID dictionary each
    :param idDictDict: assignment of local IDs to global IDs (dictionary)
    :param entries1: data of the first source (list of dictionaries)
    :param entries2: data of the second source (list of dictionaries)
    :param sameTables: 1 if both tables are identical, otherwise 0
    """
    entries1ById = _indexById(entries1)
    entries2ById = _indexById(entries2)
    # IDs that have already been output so that no ID is output twice
    printedIds = set()
    with open("tableResult.csv", "w", newline="", encoding="utf-8-sig") as f2:
        filewriter = csv.DictWriter(f2, fieldnames=_RESULT_COLUMNS, delimiter="\t")
        filewriter.writeheader()
        # iterate all records to be merged
        for index, innerIdDictList in enumerate(idDictList):
            if index % 100 == 0:
                _printProgress("Status: Progress of the output of the linked list, ", index, len(idDictList))

            for i in innerIdDictList:
                # retrieve the two records to be merged (None if there is no such record)
                row = entries1ById.get(i["idSource1"])
                innerrow = entries2ById.get(i["idSource2"])

                # merging is only necessary if two records are present
                # otherwise the present record is taken over under change of the IDs
                if i["idSource1"] != "" and i["idSource2"] == "":
                    # case 1: only the first one is available
                    _writeSingleRecord(filewriter, row, _globalLinks(row, idDictDict, "idSource1"),
                                       printedIds)
                elif i["idSource2"] != "" and i["idSource1"] == "" and sameTables != 1:
                    # case 2: only the second is present
                    # only for unequal files, because otherwise duplicate entries appear
                    # the local ID belongs to the second source, so it is kept in "idSource2"
                    _writeSingleRecord(filewriter, innerrow, _globalLinks(innerrow, idDictDict, "idSource2"),
                                       printedIds)
                elif row is None or innerrow is None:
                    # a linked record cannot be found, so there is nothing that could be merged
                    print("Status: Was not printed", i["idSource1"] or i["idSource2"], i["idGlobal"])
                elif row["id"] not in printedIds and innerrow["id"] not in printedIds:
                    # case 3: both are present
                    dictparams = _mergeRecords(row, innerrow, idDictList, idDictDict)
                    filewriter.writerow(dictparams)
                    printedIds.add(dictparams["idSource1"])
                    printedIds.add(dictparams["idSource2"])
                elif row["id"] in printedIds and innerrow["id"] not in printedIds:
                    # only one record is in, then the other must be de-merged
                    # treat record as if only the one element was there
                    record = entries1ById.get(innerrow["id"])
                    sourceColumn = "idSource1"
                    if record is None:
                        # with different tables the record exists in the second table only
                        record = innerrow
                        sourceColumn = "idSource2"
                    # the IDs of the reference persons are taken over unchanged
                    # (were partly already searched, don't search twice!)
                    updates = {"idGlobal": searchIdDict2(idDictDict, innerrow["id"]),
                               sourceColumn: record["id"]}
                    for key in _RELATION_COLUMNS:
                        updates[key] = innerrow[key]
                    _writeSingleRecord(filewriter, record, updates, printedIds)
                elif row["id"] not in printedIds and innerrow["id"] in printedIds:
                    # treat record as if only the one element was there
                    # the IDs of the reference persons stay as they are in the record
                    updates = {"idGlobal": searchIdDict2(idDictDict, row["id"]),
                               "idSource1": row["id"]}
                    _writeSingleRecord(filewriter, row, updates, printedIds)
                else:
                    print("Status: Was not printed", row["id"], searchIdDict2(idDictDict, row["id"]))
def listToString(elementList):
    """
    This function combines the elements from lists into a string.
    Empty elements are left out, the remaining elements are separated by ", ".
    A list with exactly one element returns this element unchanged.
    :param elementList: list of elements (list)
    :return: merged list (string)
    """
    if len(elementList) == 0:
        return ""
    if len(elementList) == 1:
        return elementList[0]
    # joining only the filled elements avoids a trailing separator
    # if the last element is empty
    return ", ".join(element for element in elementList if element != "")


def juliandate(correctDate):
    """
    This function converts classical dates into a Julian date.
    :param correctDate: date (datetime)
    :return: julian date
    """
    # no local variable named "time", so that the imported module of the same name is not hidden
    return astropy.time.Time(correctDate).jd


def dateBetAndCorrector(datecheck):
    """
    This function selects from a date specification in the format "BET ... AND ..." the first date.
    :param datecheck: date (string)
    :return: first date (string) or None if the specification does not contain "BET"
    """
    start = datecheck.find("BET")
    if start == -1:
        return None
    # the date begins after "BET" and the separator that follows it
    begin = start + 4
    end = datecheck.find("AND", begin)
    if end == -1:
        # there is no second date, so everything after "BET" is the date
        return datecheck[begin:]
    # the date ends before the separator in front of "AND"
    return datecheck[begin:end - 1]
# compiled once instead of once per record
_FEMALE_FIRSTNAME = re.compile(r"[A-Za-z\s.]+[ae]$")

# kinds of column handling used by fileReader()
_TEXT = "text"  # plain value, defaults to ""
_EVENT_DATE = "eventDate"  # date, may be given as a period "BET ... AND ..."
_DIVORCE_DATE = "divorceDate"  # date, raw text is kept if it cannot be converted

# columns of the standard form in the order in which they are normalised
# (the order decides where missing columns are appended to a record)
_FILEREADER_COLUMNS = (
    ("surnameGiven", _TEXT),
    ("surnameChange", _TEXT),
    ("surnameMarriage1", _TEXT),
    ("surnameMarriage2", _TEXT),
    ("surnameMarriage3", _TEXT),
    ("surnameUnknown", _TEXT),
    ("birthday", _EVENT_DATE),
    ("birthplace", _TEXT),
    ("birthplaceGOV", _TEXT),
    ("growthUpPlace", _TEXT),
    ("growthUpPlaceGOV", _TEXT),
    ("baptismday", _EVENT_DATE),
    ("baptismplace", _TEXT),
    ("baptismplaceGOV", _TEXT),
    ("marriageday1", _EVENT_DATE),
    ("marriageplace1", _TEXT),
    ("marriageplaceGOV1", _TEXT),
    ("ageAtMarriage1", _TEXT),
    ("idSpouse1", _TEXT),
    ("divorceday1", _DIVORCE_DATE),
    ("marriageday2", _EVENT_DATE),
    ("marriageplace2", _TEXT),
    ("marriageplaceGOV2", _TEXT),
    ("ageAtMarriage2", _TEXT),
    ("idSpouse2", _TEXT),
    ("divorceday2", _DIVORCE_DATE),
    ("marriageday3", _EVENT_DATE),
    ("marriageplace3", _TEXT),
    ("marriageplaceGOV3", _TEXT),
    ("ageAtMarriage3", _TEXT),
    ("idSpouse3", _TEXT),
    ("divorceday3", _DIVORCE_DATE),
    ("deathday", _EVENT_DATE),
    ("deathplace", _TEXT),
    ("deathplaceGOV", _TEXT),
    ("causeOfDeath", _TEXT),
    ("martialStatusAtDeath", _TEXT),
    ("ageAtDeath", _TEXT),
    ("burialday", _EVENT_DATE),
    ("burialplace", _TEXT),
    ("burialplaceGOV", _TEXT),
)


def _sniffDelimiter(csvfile):
    """
    This function decides whether a CSV file is separated by tab stops or by semicolons.
    :param csvfile: name of the CSV file
    :return: ";" if the first column heading contains semicolons, otherwise a tab stop
    """
    # read with tab stops: a semicolon-separated header then ends up in one single heading
    with open(csvfile, encoding="utf-8-sig") as f0:
        header = csv.DictReader(f0, delimiter="\t").fieldnames
    # only the header is needed; the data rows all carry the same first key
    if header and header[0].count(";"):
        return ";"
    return "\t"


def _resolveAndSplit(line, key, resolver):
    """
    This function resolves the abbreviations of a column and splits the result into single words.
    :param line: record (dictionary), changed in place
    :param key: name of the column
    :param resolver: function that expands the abbreviations of the text
    """
    try:
        line[key] = resolver(line[key])
        line[key] = line[key].split()
    except KeyError:
        # column does not exist (a KeyError raised inside the resolver ends here as well)
        line.update({key: ""})


def _guessSex(line):
    """
    This function derives the sex from the first of the first names.
    :param line: record (dictionary) with "firstnameGiven" and "firstnameChange" already split
    :return: "F", "M" or "" if no first name is available
    """
    for key in ("firstnameGiven", "firstnameChange"):
        names = line[key]
        if names != [] and names != "":
            if _FEMALE_FIRSTNAME.search(names[0]):
                return "F"
            return "M"
    return ""


def _resolveEventDate(line, key):
    """
    This function converts the date of a column, also if it is given as "BET ... AND ...".
    :param line: record (dictionary), changed in place
    :param key: name of the date column
    """
    if key not in line:
        # a missing column used to end in an uncaught KeyError while printing the status
        line[key] = ""
        return
    raw = line[key]
    try:
        converted = convertintodate(raw)
    except Exception:
        converted = ""
    if converted != "":
        line[key] = converted
        return
    # NOTE(review): convertintodate() is defined outside this block; it is assumed that ""
    # stands for "nothing recognised" and that it raises for values such as None
    try:
        # checking if it is a period of time
        line[key] = convertintodate(dateBetAndCorrector(raw))
    except Exception:
        print("Status: The following date could not be converted,", raw)


def _resolveDivorceDate(line, key):
    """
    This function converts the date of a divorce column; text that cannot be converted is kept.
    :param line: record (dictionary), changed in place
    :param key: name of the date column
    """
    if key not in line:
        line[key] = ""
        return
    try:
        line[key] = convertintodate(line[key])
    except Exception:
        # deliberate best effort: the original text stays in the record
        pass


def fileReader(csvfile):
    """
    This function reads in the databases as a CSV file.
    Every record is brought into the standard form: missing columns are added as "",
    names and occupations are resolved and split into lists, dates are converted.
    :param csvfile: name of the CSV file
    :return: data from a source (list of dictionaries)
    """
    # define delimiter
    # start from tab stop
    # if the first key has many semicolons, change the separator character
    delimiter = _sniffDelimiter(csvfile)

    entries = []
    with open(csvfile, encoding="utf-8-sig") as f1:
        filereader = csv.DictReader(f1, delimiter=delimiter)
        for index, line in enumerate(filereader):
            # a missing or empty id is replaced by the position of the record in the file
            if line.get("id", "") == "":
                line.update({"id": str(index)})

            # resolve all the abbreviated first names and split the first names into single names
            _resolveAndSplit(line, "firstnameGiven", klfnameresolver.resolvename)
            _resolveAndSplit(line, "firstnameChange", klfnameresolver.resolvename)

            # the sex is only guessed if the source has no such column
            if "sex" not in line:
                line.update({"sex": _guessSex(line)})

            for key, kind in _FILEREADER_COLUMNS:
                if kind == _EVENT_DATE:
                    _resolveEventDate(line, key)
                elif kind == _DIVORCE_DATE:
                    _resolveDivorceDate(line, key)
                else:
                    line.setdefault(key, "")

            # resolve all the jobs
            _resolveAndSplit(line, "occupation", occuresolver.resolveroccu)

            line.setdefault("idFather", "")
            line.setdefault("idMother", "")

            # abbreviation expansion should happen here
            entries.append(line)
    return entries
def compareDates(a, d):
    """
    This function compares two dates.
    :param a: first date (datetime or string in the form "DD.MM.YYYY")
    :param d: second date (datetime or string in the form "DD.MM.YYYY")
    :return: 0 if a is later than d, otherwise None (also if one of the dates is empty)
    :raises ValueError: if a non-empty string is not in the form "DD.MM.YYYY"
    """
    # empty values must be sorted out before parsing; the check used to come after
    # strptime() and therefore could never be reached with an empty string
    if a == "" or d == "":
        return None
    if not isinstance(a, datetime):
        a = datetime.strptime(a, "%d.%m.%Y")
    if not isinstance(d, datetime):
        d = datetime.strptime(d, "%d.%m.%Y")
    if a > d:
        return 0
    return None
def _occupationList(record):
    """Return the occupations of a record as a list (a string is split at commas)."""
    occupations = record["occupation"]
    if type(occupations) == list:
        return occupations
    return occupations.split(",")


def _hasAnyDate(record, keys):
    """Return True if at least one of the given columns of the record is not empty."""
    return any(record[key] != "" for key in keys)


def prioritycalc(row, innerrow):
    """
    This function checks if different variables are present in both records (row, innerrow).
    The namecheck() function also returns 1 if only the names are the same and nothing else is present.
    Here, various variable combinations are now tested that increase the probability of a person match - compared to a pure name match.
    :param row: first record (dictionary)
    :param innerrow: second record (dictionary)
    :return: priority (binary)
    """
    marriageKeys = ("marriageday1", "marriageday2", "marriageday3")
    birthKeys = ("birthday", "baptismday")
    deathKeys = ("deathday", "burialday")

    # one profession matches
    innerrowOccuList = _occupationList(innerrow)
    for occuInRow in _occupationList(row):
        # an empty entry is no profession: "" as default value results in [""]
        # after splitting and used to count as a match on both sides
        if occuInRow == "":
            continue
        # Skip citizen details ("Bürger")
        if occuInRow[-5:] == "ürger":
            continue
        if occuInRow in innerrowOccuList:
            return 1

    # marriage date the same (no matter which)
    rowMarriages = [row[key] for key in marriageKeys if row[key] != ""]
    innerrowMarriages = [innerrow[key] for key in marriageKeys if innerrow[key] != ""]
    if any(first == second for first in rowMarriages for second in innerrowMarriages):
        return 1

    rowBorn = _hasAnyDate(row, birthKeys)
    innerrowBorn = _hasAnyDate(innerrow, birthKeys)
    rowDied = _hasAnyDate(row, deathKeys)
    innerrowDied = _hasAnyDate(innerrow, deathKeys)

    # date of birth or date of baptism available for both
    if rowBorn and innerrowBorn:
        return 1
    # date of birth or baptism and date of death or burial available
    if (rowBorn and innerrowDied) or (innerrowBorn and rowDied):
        return 1
    # date of death or burial available for both
    if rowDied and innerrowDied:
        return 1
    return 0
def datetimeToString(date):
    """
    This function turns a datetime value into readable text of the form "DD.MM.YYYY".
    Values of any other type are handed back unchanged.
    :param date: date (datetime) or any other value
    :return: date (string) or the unchanged value
    """
    # everything that is not exactly a datetime passes through untouched
    if type(date) is not datetime:
        return date
    readable = date.strftime("%d.%m.%Y")
    return readable
+ :param row: first row being compared (dictionary) + :param innerrow: second row being compared (dictionary) + :param fathers: row of the father of the person in "row" (dictionary) + :param mothers: row of the mother of the person in "row" (dictionary) + :param spouse1: row of the first spouse of the person in "row" (dictionary) + :param spouse2: row of the second spouse of the person in "row" (dictionary) + :param spouse3: row of the third spouse of the person in "row" (dictionary) + :param letterDictBegin: number of row in entries2 in which a new first letter of surname begins (dictionary) + :param letterDict: number of row in entries2 in which a first letter o surname ends (dictionary) + :param entries2: second list of records (list of dictionaries) + :return: similarity according to Jaro-Winkler (integer or float) + """ + # at the beginning briefly check the names before the 100 other queries are made + # only when this is plausible, do the other things + # if surnameGiven is there, but no surnameUnkown, then overwrite + if row["surnameUnknown"] == "": + row["surnameUnknown"] = row["surnameGiven"] + if innerrow["surnameUnknown"] == "": + innerrow["surnameUnknown"] = innerrow["surnameGiven"] + + # here the disjunction rules begin + + # check if all necessary fields are there // w/o these lines the distance line didn't work after index 0 + if not row["surnameUnknown"] or not row["firstnameGiven"] or not innerrow["surnameUnknown"] \ + or not innerrow["firstnameGiven"]: + return 0 + + # surnames of the persons must be similar + # this has to stay here because the following aspects are computationally intensive and lead to a significant runtime extension + try: + lastnamedist = cachedjarowinkler(row["surnameUnknown"], innerrow["surnameUnknown"]) + except MemoryError: + print("Status: Memory error", row["surnameUnknown"], innerrow["surnameUnknown"]) + if lastnamedist <= 0.95: # at 0.8 it takes too long and the results are not good + return 0 + cachedphoneticsRow = 
cachedphonetics(row["surnameUnknown"]) + cachedphoneticsInnerrow = cachedphonetics(innerrow["surnameUnknown"]) + if cachedphoneticsRow != cachedphoneticsInnerrow and lastnamedist <= 0.60: + return 0 + + # if a first name is not in one and a first name of the other is not in the other + firstNameNotInSecondName = 0 + for firstFirstname in row["firstnameGiven"]: + if firstFirstname not in innerrow["firstnameGiven"]: + firstNameNotInSecondName = 1 + break + secondNameNotInSecondName = 0 + for secondFirstname in innerrow["firstnameGiven"]: + if secondFirstname not in row["firstnameGiven"]: + secondNameNotInSecondName = 1 + break + if firstNameNotInSecondName == 1 and secondNameNotInSecondName == 1: + return (0) + + # if first names are there but none match, then it is not the same person + sameFirstname = 0 + if row["firstnameGiven"] != "" and innerrow["firstnameGiven"] != "": + # iterate firstnames + for firstFirstname in row["firstnameGiven"]: + for secondFirstname in innerrow["firstnameGiven"]: + if firstFirstname == secondFirstname: + sameFirstname = 1 + break + if sameFirstname == 1: + break # improved runtime if a hit has already been made + if sameFirstname == 0: + return 0 + + # persons must have the same sex + if row["sex"] != innerrow["sex"]: + return 0 + + # persons must not belong to the same source + if row["source"] == innerrow["source"]: + return 0 + + # difference of birthday and deathday of father have to be < 9 months + try: + if row["idFather"] != "" and innerrow["birthday"] != "": + if fathers["id"] == row["idFather"] and fathers["deathday"] != "": + # same ID and deathday must filled out + timediff = innerrow["birthday"] - fathers["deathday"] + if timediff > timedelta(9 * 31): # nine month, 9*31 days + return 0 + except: + pass + try: + if innerrow["idFather"] != "" and row["birthday"] != "": + for fathers in entries2[letterDictBegin:letterDict]: # search father in the same file + if fathers["id"] == innerrow["idFather"] and fathers["deathday"] != 
"": + # same ID and deathday, birthday must filled out + timediff = row["birthday"] - fathers["deathday"] + if timediff > timedelta(9 * 31): # nine month, 9*31 days + return 0 + except: + pass + + # difference of baptismday and deathday of father have to be < 9 months + try: + if row["idFather"] != "" and innerrow["baptismday"] != "": + if fathers["id"] == row["idFather"] and fathers["deathday"] != "": + # same ID and deathday must filled out + timediff = innerrow["baptismday"] - fathers["deathday"] + if timediff > timedelta(9 * 31): # nine month, 9*31 days + return 0 + except: + pass + try: + if innerrow["idFather"] != "" and row["baptismday"] != "": + for fathers in entries2[letterDictBegin:letterDict]: # search father in the same file + if fathers["id"] == innerrow["idFather"] and fathers["deathday"] != "": + # same ID and deathday, birthday must filled out + timediff = row["baptismday"] - fathers["deathday"] + if timediff > timedelta(9 * 31): # nine month, 9*31 days + return 0 + except: + pass + + # difference of birthday and burialday of father have to be < 9 months + try: + if row["idFather"] != "" and innerrow["birthday"] != "": + if fathers["id"] == row["idFather"] and fathers["burialday"] != "": + # same ID and deathday must filled out + timediff = innerrow["birthday"] - fathers["burialday"] + if timediff > timedelta(9 * 31): # nine month, 9*31 days + return 0 + except: + pass + try: + if innerrow["idFather"] != "" and row["birthday"] != "": + for fathers in entries2[letterDictBegin:letterDict]: # search father in the same file + if fathers["id"] == innerrow["idFather"] and fathers["deathday"] != "": + # same ID and deathday, birthday must filled out + timediff = row["birthday"] - fathers["burialday"] + if timediff > timedelta(9 * 31): # nine month, 9*31 days + return 0 + except: + pass + + # difference of baptismday and burialday of father have to be < 9 months + try: + if row["idFather"] != "" and innerrow["baptismday"] != "": + if fathers["id"] == 
row["idFather"] and fathers["burialday"] != "": + # same ID and deathday must filled out + timediff = innerrow["baptismday"] - fathers["burialday"] + if timediff > timedelta(9 * 31): # nine month, 9*31 days + return 0 + except: + pass + try: + if innerrow["idFather"] != "" and row["baptismday"] != "": + for fathers in entries2[letterDictBegin:letterDict]: # search father in the same file + if fathers["id"] == innerrow["idFather"] and fathers["baptismday"] != "": + # same ID and deathday, birthday must filled out + timediff = row["birthday"] - fathers["burialday"] + if timediff > timedelta(9 * 31): # nine month, 9*31 days + return 0 + except: + pass + + # birthday must happen before deathday of mother (same day allowed) + try: + if row["idMother"] != "" and innerrow["birthday"] != "": + if mothers["id"] == row["idMother"] and mothers["deathday"] != "": + # same ID and deathday must filled out + timediff = innerrow["birthday"] - mothers["deathday"] + if timediff > timedelta(0): # wenn größer 0; negative values are possible, 0 is also possible + return 0 + except: + pass + try: + if innerrow["idMother"] != "" and row["birthday"] != "": + for mothers in entries2[letterDictBegin:letterDict]: # search mother in the same file + if mothers["id"] == innerrow["idMother"] and mothers["deathday"] != "": + # same ID and deathday must filled out + timediff = row["birthday"] - mothers["deathday"] + if timediff > timedelta(0): # wenn größer 0; negative values are possible, 0 is also possible + return 0 + except: + pass + + # birthday must happen before burialday of mother + try: + if row["idMother"] != "" and innerrow["birthday"] != "": + if mothers["id"] == row["idMother"] and mothers["deathday"] != "": + # same ID and deathday must filled out + timediff = innerrow["birthday"] - mothers["deathday"] + if timediff > timedelta(0): # if greater than 0; negative values are possible, 0 is also possible + return 0 + except: + pass + try: + if innerrow["idMother"] != "" and 
row["birthday"] != "": + for mothers in entries2[letterDictBegin:letterDict]: # search mother in the same file + if mothers["id"] == innerrow["idMother"] and mothers["burialday"] != "": + # same ID and deathday must filled out + timediff = row["birthday"] - mothers["burialday"] + if timediff > timedelta(0): # if greater than 0; negative values are possible, 0 is also possible + return 0 + except: + pass + + # difference of birthday and birthday of father have to be > 13 years + try: + if row["idFather"] != "" and innerrow["birthday"] != "": + if fathers["id"] == row["idFather"] and fathers["birthday"] != "": + # same ID and deathday must filled out + timediff = innerrow["birthday"] - fathers["birthday"] + if timediff < timedelta(13 * 365): + return 0 + except: + pass + try: + if innerrow["idFather"] != "" and row["birthday"] != "": + for fathers in entries2[letterDictBegin:letterDict]: # search father in the same file + if fathers["id"] == innerrow["idFather"] and fathers["birthday"] != "": + # same ID and deathday, birthday must filled out + timediff = row["birthday"] - fathers["birthday"] + if timediff < timedelta(13 * 365): + return 0 + except: + pass + + # difference of baptismday and birthday of father have to be > 13 years + try: + if row["idFather"] != "" and innerrow["baptismday"] != "": + if fathers["id"] == row["idFather"] and fathers["birthday"] != "": + # same ID and deathday must filled out + timediff = innerrow["baptismday"] - fathers["birthday"] + if timediff < timedelta(13 * 365): + return 0 + except: + pass + try: + if innerrow["idFather"] != "" and row["baptismday"] != "": + for fathers in entries2[letterDictBegin:letterDict]: # search father in the same file + if fathers["id"] == innerrow["idFather"] and fathers["birthday"] != "": + # same ID and deathday, birthday must filled out + timediff = row["baptismday"] - fathers["birthday"] + if timediff < timedelta(13 * 365): + return 0 + except: + pass + + # difference of birthday and baptismday of 
father have to be > 13 years + try: + if row["idFather"] != "" and innerrow["birthday"] != "": + if fathers["id"] == row["idFather"] and fathers["baptismday"] != "": + # same ID and deathday must filled out + timediff = innerrow["birthday"] - fathers["baptismday"] + if timediff < timedelta(13 * 365): + return 0 + except: + pass + try: + if innerrow["idFather"] != "" and row["birthday"] != "": + for fathers in entries2[letterDictBegin:letterDict]: # search father in the same file + if fathers["id"] == innerrow["idFather"] and fathers["baptismday"] != "": + # same ID and deathday, birthday must filled out + timediff = row["birthday"] - fathers["baptismday"] + if timediff < timedelta(13 * 365): + return 0 + except: + pass + + # difference of baptismday and baptismday of father have to be > 13 years + try: + if row["idFather"] != "" and innerrow["baptismday"] != "": + if fathers["id"] == row["idFather"] and fathers["baptismday"] != "": + # same ID and deathday must filled out + timediff = innerrow["baptismday"] - fathers["baptismday"] + if timediff < timedelta(13 * 365): + return 0 + except: + pass + try: + if innerrow["idFather"] != "" and row["baptismday"] != "": + for fathers in entries2[letterDictBegin:letterDict]: # search father in the same file + if fathers["id"] == innerrow["idFather"] and fathers["baptismday"] != "": + # same ID and deathday, birthday must filled out + timediff = row["baptismday"] - fathers["baptismday"] + if timediff < timedelta(13 * 365): + return 0 + except: + pass + + # difference of birthday and birthday of mother have to be > 13 years + try: + if row["idMother"] != "" and innerrow["birthday"] != "": + if fathers["id"] == row["idFather"] and fathers["birthday"] != "": + # same ID and deathday must filled out + timediff = innerrow["birthday"] - fathers["birthday"] + if timediff < timedelta(13 * 365): + return 0 + except: + pass + try: + if innerrow["idMother"] != "" and row["birthday"] != "": + for fathers in 
entries2[letterDictBegin:letterDict]: # search father in the same file + if fathers["id"] == innerrow["idFather"] and fathers["birthday"] != "": + # same ID and deathday, birthday must filled out + timediff = row["birthday"] - fathers["birthday"] + if timediff < timedelta(13 * 365): + return 0 + except: + pass + + # difference of baptismday and birthday of mother have to be > 13 years + try: + if row["idMother"] != "" and innerrow["baptismday"] != "": + if fathers["id"] == row["idFather"] and fathers["birthday"] != "": + # same ID and deathday must filled out + timediff = innerrow["baptismday"] - fathers["birthday"] + if timediff < timedelta(13 * 365): + return 0 + except: + pass + try: + if innerrow["idMother"] != "" and row["baptismday"] != "": + for fathers in entries2[letterDictBegin:letterDict]: # search father in the same file + if fathers["id"] == innerrow["idFather"] and fathers["birthday"] != "": + # same ID and deathday, birthday must filled out + timediff = row["baptismday"] - fathers["birthday"] + if timediff < timedelta(13 * 365): + return 0 + except: + pass + + # difference of birthday and baptismday of mother have to be > 13 years + try: + if row["idMother"] != "" and innerrow["birthday"] != "": + if fathers["id"] == row["idFather"] and fathers["baptismday"] != "": + # same ID and deathday must filled out + timediff = innerrow["birthday"] - fathers["baptismday"] + if timediff < timedelta(13 * 365): + return 0 + except: + pass + try: + if innerrow["idMother"] != "" and row["birthday"] != "": + for fathers in entries2[letterDictBegin:letterDict]: # search father in the same file + if fathers["id"] == innerrow["idFather"] and fathers["baptismday"] != "": + # same ID and deathday, birthday must filled out + timediff = row["birthday"] - fathers["baptismday"] + if timediff < timedelta(13 * 365): + return 0 + except: + pass + + # difference of baptismday and baptismday of mother have to be > 13 years + try: + if row["idMother"] != "" and 
innerrow["baptismday"] != "": + if mothers["id"] == row["idMother"] and mothers["baptismday"] != "": + # same ID and deathday must filled out + timediff = innerrow["baptismday"] - mothers["baptismday"] + if timediff < timedelta(13 * 365): + return 0 + except: + pass + try: + if innerrow["idMother"] != "" and row["baptismday"] != "": + for mothers in entries2[letterDictBegin:letterDict]: # search father in the same file + if mothers["id"] == innerrow["idMother"] and mothers["baptismday"] != "": + # same ID and deathday, birthday must filled out + timediff = row["baptismday"] - mothers["baptismday"] + if timediff < timedelta(13 * 365): + return 0 + except: + pass + + # difference of baptismday and birthday < 3 year (only with historical data!) + try: + timediff = innerrow["baptismday"] - row["birthday"] + if timediff > timedelta(3 * 365) or timediff < timedelta(-3 * 365): + return 0 + except: + pass + try: + timediff = row["baptismday"] - innerrow["birthday"] + if timediff > timedelta(3 * 365) or timediff < timedelta(-3 * 365): + return 0 + except: + pass + + # difference between ageAtMarriage and calculated age between birthday and marriageday1 > 5 year + try: + timediff1 = innerrow["marriageday1"] - innerrow["birthday"] + timediff2 = datetime.strptime("1.1.00" + row["ageAtMarriage"], "%d.%m.%Y") - datetime.strptime("01.01.0001", + "%d.%m.%Y") + # works only with ages under 100 years + if (timediff1 - timediff2) > timedelta(5 * 365) or (timediff2 - timediff1) < timedelta( + -5 * 365): # 5 years * 365 days, approx. 
+ return 0 + except: + pass + + # difference between ageAtMarriage and calculated age between baptismday and marriageday1 > 5 year + try: + timediff1 = innerrow["marriageday1"] - innerrow["baptismday"] + timediff2 = datetime.strptime("1.1.00" + row["ageAtMarriage"], "%d.%m.%Y") - datetime.strptime("01.01.0001", + "%d.%m.%Y") + # works only with ages under 100 years + if (timediff1 - timediff2) > timedelta(5 * 365) or (timediff2 - timediff1) < timedelta( + -5 * 365): # 5 years * 365 days, approx. + return 0 + except: + pass + + # calculated age between birthday and marriageday1 < 13 years + try: + if type(innerrow["marriageday1"]) != datetime and len(innerrow["marriageday1"]) == 4: + mday = datetime.strptime("01.01." + innerrow["marriageday1"], "%d.%m.%Y") + else: + mday = innerrow["marriageday1"] + + timediff = mday - row["birthday"] + + if timedelta(13 * 365) > timediff > timedelta(-13 * 365): + return 0 + except: + pass + try: + if type(innerrow["marriageday1"]) != datetime and len(row["marriageday1"]) == 4: + mday = datetime.strptime("01.01." + row["marriageday1"], "%d.%m.%Y") + else: + mday = row["marriageday1"] + + timediff = mday - innerrow["birthday"] + + if timedelta(13 * 365) > timediff > timedelta(-13 * 365): + return 0 + except: + pass + + # calculated age between baptismday and marriageday1 < 13 years + try: + if type(innerrow["marriageday1"]) != datetime and len(row["marriageday1"]) == 4: + mday = datetime.strptime("01.01." + row["marriageday1"], "%d.%m.%Y") + else: + mday = row["marriageday1"] + + timediff = mday - innerrow["baptismday"] + + if timedelta(13 * 365) > timediff > timedelta(-13 * 365): + return 0 + except: + pass + try: + if type(innerrow["marriageday1"]) != datetime and len(innerrow["marriageday1"]) == 4: + mday = datetime.strptime("01.01." 
+ innerrow["marriageday1"], "%d.%m.%Y") + else: + mday = innerrow["marriageday1"] + + timediff = mday - row["baptismday"] + + if timediff < timedelta(13 * 365) and timediff > timedelta(-13 * 365): + return 0 + except: + pass + + # calculated age between birthday and divorceday1 < 13 years + try: + timediff = row["divorceday1"] - innerrow["birthday"] + if timedelta(13 * 365) > timediff > timedelta(-13 * 365): + return 0 + except: + pass + try: + timediff = innerrow["divorceday1"] - row["birthday"] + if timedelta(13 * 365) > timediff > timedelta(-13 * 365): + return 0 + except: + pass + + # calculated age between baptismday and divorceday1 < 13 years + try: + timediff = innerrow["divorceday1"] - row["baptismday"] + if timedelta(13 * 365) > timediff > timedelta(-13 * 365): + return 0 + except: + pass + try: + timediff = row["divorceday1"] - innerrow["baptismday"] + if timedelta(13 * 365) > timediff > timedelta(-13 * 365): + return 0 + except: + pass + + # difference between ageAtDeath and calculated age between birthday and deathday > 10 + try: + timediff1 = innerrow["deathday"] - innerrow["birthday"] + timediff2 = datetime.strptime("1.1.00" + row["ageAtDeath"], "%d.%m.%Y") - datetime.strptime("01.01.0001", + "%d.%m.%Y") + # works only with ages under 100 years + if (timediff1 - timediff2) > timedelta(10 * 365) or (timediff2 - timediff1) < timedelta( + -10 * 365): # 10 year * 365 days, approx. + return 0 + except: + pass + + # difference between ageAtDeath and calculated age between baptismday and deathday > 10 + try: + timediff1 = innerrow["deathday"] - innerrow["baptismday"] + timediff2 = datetime.strptime("1.1.00" + row["ageAtDeath"], "%d.%m.%Y") - datetime.strptime("01.01.0001", + "%d.%m.%Y") + # works only with ages under 100 years + if (timediff1 - timediff2) > timedelta(10 * 365) or (timediff2 - timediff1) < timedelta( + -10 * 365): # 10 year * 365 days, approx. 
+ return 0 + except: + pass + + # difference between ageAtDeath and calculated age between birthday and burialday > 10 + try: + timediff1 = innerrow["burialday"] - innerrow["birthday"] + timediff2 = datetime.strptime("1.1.00" + row["ageAtDeath"], "%d.%m.%Y") - datetime.strptime("01.01.0001", + "%d.%m.%Y") + # works only with ages under 100 years + if (timediff1 - timediff2) > timedelta(10 * 365) or (timediff2 - timediff1) < timedelta( + -10 * 365): # 10 year * 365 days, approx. + return 0 + except: + pass + + # difference between ageAtDeath and calculated age between baptismday and burialday > 10 + try: + timediff1 = innerrow["burialday"] - innerrow["baptismday"] + timediff2 = datetime.strptime("1.1.00" + row["ageAtDeath"], "%d.%m.%Y") - datetime.strptime("01.01.0001", + "%d.%m.%Y") + # works only with ages under 100 years + if (timediff1 - timediff2) > timedelta(10 * 365) or (timediff2 - timediff1) < timedelta( + -10 * 365): # 10 year * 365 days, approx. + return 0 + except: + pass + + # birthday > baptismday + try: + if compareDates(row["birthday"], innerrow["baptismday"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["birthday"], row["baptismday"]) == 0: + return 0 + except: + pass + + # birthday > marriageday1 + try: + if compareDates(row["birthday"], innerrow["marriageday1"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["birthday"], row["marriageday1"]) == 0: + return 0 + except: + pass + + # birthday > divorceday1 + try: + if compareDates(row["birthday"], innerrow["divorceday1"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["birthday"], row["divorceday1"]) == 0: + return 0 + except: + pass + + # birthday > deathday + try: + if compareDates(row["birthday"], innerrow["deathday"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["birthday"], row["deathday"]) == 0: + return 0 + except: + pass + + # birthday > burialday + try: + if compareDates(row["birthday"], 
innerrow["burialday"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["birthday"], row["burialday"]) == 0: + return 0 + except: + pass + + # baptismday > marriageday1 + try: + if compareDates(row["baptismday"], innerrow["marriageday1"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["baptismday"], row["marriageday1"]) == 0: + return 0 + except: + pass + + # baptismday > divorceday1 + try: + if compareDates(row["baptismday"], innerrow["divorceday1"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["baptismday"], row["divorceday1"]) == 0: + return 0 + except: + pass + + # baptismday > deathday + try: + if compareDates(row["baptismday"], innerrow["deathday"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["baptismday"], row["deathday"]) == 0: + return 0 + except: + pass + + # baptismday > burialday + try: + if compareDates(row["baptismday"], innerrow["burialday"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["baptismday"], row["burialday"]) == 0: + return 0 + except: + pass + + # marriageday1 > divorceday1 + try: + if compareDates(row["marriageday1"], innerrow["divorceday1"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["marriageday1"], row["divorceday1"]) == 0: + return 0 + except: + pass + + # marriageday1 > deathday + try: + if compareDates(row["marriageday1"], innerrow["deathday"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["marriageday1"], row["deathday"]) == 0: + return 0 + except: + pass + + # marriageday1 > burialday + try: + if compareDates(row["marriageday1"], innerrow["burialday"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["marriageday1"], row["burialday"]) == 0: + return 0 + except: + pass + + # divorceday1 > deathday + try: + if compareDates(row["divorceday1"], innerrow["deathday"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["divorceday1"], 
row["deathday"]) == 0: + return 0 + except: + pass + + # divorceday1 > burialday + try: + if compareDates(row["divorceday1"], innerrow["burialday"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["divorceday1"], row["burialday"]) == 0: + return 0 + except: + pass + + # marriageday2 > marriageday3 + try: + if compareDates(row["marriageday2"], innerrow["marriageday3"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["marriageday2"], row["marriageday3"]) == 0: + return 0 + except: + pass + + # marriageday2 > divorceday2 + try: + if compareDates(row["marriageday2"], innerrow["divorceday2"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["marriageday2"], row["divorceday2"]) == 0: + return 0 + except: + pass + + # marriageday2 > deathday + try: + if compareDates(row["marriageday2"], innerrow["deathday"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["marriageday2"], row["deathday"]) == 0: + return 0 + except: + pass + + # marriageday2 > burialday + try: + if compareDates(row["marriageday2"], innerrow["burialday"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["marriageday2"], row["burialday"]) == 0: + return 0 + except: + pass + + # marriageday3 > divorceday3 + try: + if compareDates(row["marriageday3"], innerrow["divorceday3"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["marriageday3"], row["divorceday3"]) == 0: + return 0 + except: + pass + + # marriageday3 > deathday + try: + if compareDates(row["marriageday3"], innerrow["deathday"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["marriageday3"], row["deathday"]) == 0: + return 0 + except: + pass + + # marriageday3 > burialday + try: + if compareDates(row["marriageday3"], innerrow["burialday"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["marriageday3"], row["burialday"]) == 0: + return 0 + except: + pass + + # divorceday2 > marriageday3 + try: 
+ if compareDates(row["divorceday2"], innerrow["marriageday3"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["divorceday2"], row["marriageday3"]) == 0: + return 0 + except: + pass + + # divorceday2 > deathday + try: + if compareDates(row["divorceday2"], innerrow["deathday"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["divorceday2"], row["deathday"]) == 0: + return 0 + except: + pass + + # divorceday2 > burialday + try: + if compareDates(row["divorceday2"], innerrow["burialday"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["divorceday2"], row["burialday"]) == 0: + return 0 + except: + pass + + # divorceday3 > deathday + try: + if compareDates(row["divorceday3"], innerrow["deathday"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["divorceday3"], row["deathday"]) == 0: + return 0 + except: + pass + + # divorceday3 > burialday + try: + if compareDates(row["divorceday3"], innerrow["burialday"]) == 0: + return 0 + except: + pass + try: + if compareDates(innerrow["divorceday3"], row["burialday"]) == 0: + return 0 + except: + pass + + # difference between deathday and burialday > 1 year + try: + timediff = row["burialday"] - innerrow["deathday"] + if timediff > timedelta(1 * 365): + return 0 + except: + pass + try: + timediff = innerrow["burialday"] - row["deathday"] + if timediff > timedelta(1 * 365): + return 0 + except: + pass + + # difference between birthday and deathday > 120 years + try: + timediff = row["deathday"] - innerrow["birthday"] + if timediff > timedelta(120 * 365): + return 0 + except: + pass + try: + timediff = innerrow["deathday"] - row["birthday"] + if timediff > timedelta(120 * 365): + return 0 + except: + pass + + # difference between birthday and burialday > 120 years + try: + timediff = row["burialday"] - innerrow["birthday"] + if timediff > timedelta(120 * 365): + return 0 + except: + pass + try: + timediff = innerrow["burialday"] - row["birthday"] + if 
timediff > timedelta(120 * 365): + return 0 + except: + pass + + # difference between baptismday and deathday > 120 years + try: + timediff = row["deathday"] - innerrow["baptismday"] + if timediff > timedelta(120 * 365): + return 0 + except: + pass + try: + timediff = innerrow["deathday"] - row["baptismday"] + if timediff > timedelta(120 * 365): + return 0 + except: + pass + + # difference between baptismday and burialday > 120 years + try: + timediff = row["burialday"] - innerrow["baptismday"] + if timediff > timedelta(120 * 365): + return 0 + except: + pass + try: + timediff = innerrow["burialday"] - row["baptismday"] + if timediff > timedelta(120 * 365): + return 0 + except: + pass + + # difference between birthday and birthday > 1 years + try: + timediff = row["birthday"] - innerrow["birthday"] + if timediff > timedelta(1 * 365) or timediff < timedelta(-1 * 365): + return 0 + except: + pass + try: + timediff = innerrow["birthday"] - row["birthday"] + if timediff > timedelta(1 * 365) or timediff < timedelta(-1 * 365): + return 0 + except: + pass + + # difference between baptismday and baptismday > 1 years + try: + timediff = row["baptismday"] - innerrow["baptismday"] + if timediff > timedelta(1 * 365) or timediff < timedelta(-1 * 365): + return 0 + except: + pass + try: + timediff = innerrow["baptismday"] - row["baptismday"] + if timediff > timedelta(1 * 365) or timediff < timedelta(-1 * 365): + return 0 + except: + pass + + # difference between deathday and deathday > 1 years + try: + timediff = row["deathday"] - innerrow["deathday"] + if timediff > timedelta(1 * 365) or timediff < timedelta(-1 * 365): + return 0 + except: + pass + try: + timediff = innerrow["deathday"] - row["deathday"] + if timediff > timedelta(1 * 365) or timediff < timedelta(-1 * 365): + return 0 + except: + pass + + # difference between burialday and burialday > 1 years + try: + timediff = row["burialday"] - innerrow["burialday"] + if timediff > timedelta(1 * 365) or timediff < 
timedelta(-1 * 365): + return 0 + except: + pass + try: + timediff = innerrow["burialday"] - row["burialday"] + if timediff > timedelta(1 * 365) or timediff < timedelta(-1 * 365): + return 0 + except: + pass + + # If the two dates-of-birth are given and do not match, move on + if row["birthday"] and innerrow["birthday"] and row["birthday"] != innerrow["birthday"]: + return 0 + # if the dates of decease are given, check if they equal + if row["deathday"] and innerrow["deathday"] and row["deathday"] != innerrow["burialday"]: + return 0 + # if the baptism dates are given, check if there is a difference of 3 or more years + if row["baptismday"] and innerrow["baptismday"] and checkdates(row["baptismday"], innerrow["baptismday"]) >= 3: + return 0 + + # marriageday1 < deathday of idSpouse1 + if row["idSpouse1"] != "": + try: + # rowSpouse = next((index for (index, d) in enumerate(entries1) if d["id"] == row["idSpouse1"]), None) + if compareDates(row["marriageday1"], spouse1["deathday"]) == 0: + return 0 + except: + pass + try: + rowSpouse = next((index for (index, d) in enumerate(entries2) if d["id"] == innerrow["idSpouse1"]), None) + if compareDates(innerrow["marriageday1"], entries2[rowSpouse]["deathday"]) == 0: + return 0 + except: + pass + + # marriageday2 < deathday of idSpouse2 + if row["idSpouse2"] != "": + try: + # rowSpouse = next((index for (index, d) in enumerate(entries1) if d["id"] == row["idSpouse2"]), None) + if compareDates(row["marriageday2"], spouse2["deathday"]) == 0: + return 0 + except: + pass + try: + rowSpouse = next((index for (index, d) in enumerate(entries2) if d["id"] == innerrow["idSpouse2"]), None) + if compareDates(innerrow["marriageday2"], entries2[rowSpouse]["deathday"]) == 0: + return 0 + except: + pass + + # marriageday3 < deathday of idSpouse3 + if row["idSpouse3"] != "": + try: + # rowSpouse = next((index for (index, d) in enumerate(entries1) if d["id"] == row["idSpouse3"]), None) + if compareDates(row["marriageday3"], 
spouse3["deathday"]) == 0: + return 0 + except: + pass + try: + rowSpouse = next((index for (index, d) in enumerate(entries2) if d["id"] == innerrow["idSpouse3"]), None) + if compareDates(innerrow["marriageday3"], entries2[rowSpouse]["deathday"]) == 0: + return 0 + except: + pass + + # divorceday1 < deathday of idSpouse1 + if row["idSpouse1"] != "": + try: + # rowSpouse = next((index for (index, d) in enumerate(entries1) if d["id"] == row["idSpouse1"]), None) + if compareDates(row["divorceday1"], spouse1["deathday"]) == 0: + return 0 + except: + pass + try: + rowSpouse = next((index for (index, d) in enumerate(entries2) if d["id"] == innerrow["idSpouse1"]), None) + if compareDates(innerrow["divorceday1"], entries2[rowSpouse]["deathday"]) == 0: + return 0 + except: + pass + + # divorceday2 < deathday of idSpouse2 + if row["idSpouse2"] != "": + try: + # rowSpouse = next((index for (index, d) in enumerate(entries1) if d["id"] == row["idSpouse2"]), None) + if compareDates(row["divorceday2"], spouse2["deathday"]) == 0: + return 0 + except: + pass + try: + rowSpouse = next((index for (index, d) in enumerate(entries2) if d["id"] == innerrow["idSpouse2"]), None) + if compareDates(innerrow["divorceday2"], entries2[rowSpouse]["deathday"]) == 0: + return 0 + except: + pass + + # divorceday3 < deathday of idSpouse3 + if row["idSpouse3"] != "": + try: + # rowSpouse = next((index for (index, d) in enumerate(entries1) if d["id"] == row["idSpouse3"]), None) + if compareDates(row["divorceday3"], spouse3["deathday"]) == 0: + return 0 + except: + pass + try: + rowSpouse = next((index for (index, d) in enumerate(entries2) if d["id"] == innerrow["idSpouse3"]), None) + if compareDates(innerrow["divorceday3"], entries2[rowSpouse]["deathday"]) == 0: + return 0 + except: + pass + + # here ends the check whether the persons are disjoint + # in addition, the actual similarity check begins + if lastnamedist > 0.95 or (cachedphoneticsRow != cachedphoneticsInnerrow and lastnamedist > 0.60): + 
def searchIdDict2(idDictDict, lokalId):
    """
    This function assigns IDs from a source to the corresponding global ID of the person.

    :param idDictDict: dictionary mapping source IDs to global IDs (dictionary)
    :param lokalId: ID of a person in the source, or several separated by commas (string)
    :return: global ID of the person (string); "" if lokalId is empty or an ID is
             not contained in idDictDict; None if the first two IDs of a
             comma-separated lokalId map to different global IDs
    """
    lokalId = str(lokalId)

    # nothing to look up
    if lokalId == "":
        return ""

    if "," in lokalId:
        # split on the bare comma and strip, so that "a,b" and "a, b" are both accepted
        lokalIdList = [part.strip() for part in lokalId.split(",")]

        # assumption: the first part belongs to Source 1 and the second to Source 2
        try:
            idGlobal1 = idDictDict[lokalIdList[0]]
            idGlobal2 = idDictDict[lokalIdList[1]]
        except KeyError:  # same handling as for a single unknown ID
            print("Status: Global ID could not be determined for", lokalId)
            return ""

        if idGlobal1 == idGlobal2:
            return idGlobal1
        # the two IDs point to different persons: no common global ID
        return None

    # a single ID: search the idGlobal
    try:
        return idDictDict[lokalId]
    except KeyError:  # ID could not be determined
        print("Status: Global ID could not be determined for", lokalId)
        return ""


def firstnamescache(namelist1, namelist2):
    """
    This function calculates the average Jaro-Winkler distance of first names.

    The names are compared pairwise by position; surplus names of the longer
    list are ignored.

    :param namelist1: list of first names (list of strings)
    :param namelist2: list of first names (list of strings)
    :return: average Jaro Winkler distance per first name, 0 if there is no pair to compare
    """
    total = 0
    counter = 0
    for name1, name2 in zip(namelist1, namelist2):
        total = total + cachedjarowinkler(name1, name2)
        counter = counter + 1

    # at least one list was empty: avoid the division by zero
    if counter == 0:
        return 0
    return total / counter


def cachename(namelist):
    """
    The function calculates a value from the name components based on Cologne phonetics.

    :param namelist: list of names (list of strings)
    :return: mean of the phonetic codes of the names read as integers (float),
             0 for an empty list; a name with an empty code contributes 0
    """
    total = 0
    counter = 0
    for name in namelist:
        code = cachedphonetics(name)  # look the code up only once per name
        if code != "":
            total = total + int(code)
        counter = counter + 1

    # no names given: avoid the division by zero
    if counter == 0:
        return 0
    return total / counter


@lru_cache(maxsize=None)
def cachedjarowinkler(word1, word2):
    """
    Calculate the Jaro-Winkler similarity, caching the result for higher performance.

    The cache is unbounded and keyed on the ordered pair (word1, word2).

    :param word1: The first word to compare
    :param word2: The second word to compare
    :return: The Jaro-Winkler similarity
    """
    return distance.get_jaro_distance(word1, word2)


@lru_cache(maxsize=None)
def cachedphonetics(name):
    """
    Calculates the Cologne phonetics value for the name, caching the result for higher performance.

    :param name: The name to calculate the pronunciation for
    :return: Cologne phonetics code
    """
    return kph.encode(name)


def checkdates(year1, year2):
    """
    This function calculates the difference in whole years between two dates.

    Entries that are not datetime objects yet are converted with convertintodate.

    :param year1: the first (earlier) date, datetime object or date string
    :param year2: the second (later) date, datetime object or date string
    :return: the difference between the two dates in years (year2 - year1);
             0 if a date is missing or cannot be converted into a single date
    """
    if not year1 or not year2:
        return 0

    enddate = year2 if isinstance(year2, datetime) else convertintodate(year2)
    firstdate = year1 if isinstance(year1, datetime) else convertintodate(year1)

    # convertintodate returns "" for unreadable entries and the plain text for
    # "BET ... AND ..." entries; neither can be subtracted
    if not isinstance(enddate, datetime) or not isinstance(firstdate, datetime):
        return 0

    return rd.relativedelta(enddate, firstdate).years


def youngchild(row, innerrow):
    """
    This function checks if a person's date of death is more than 13 years after the date of baptism or date of birth.
    It returns 0 if the person was a stillborn or didn't reach a specific age (13 by the time of development)
    otherwise it returns 1

    Apart from the two same-day checks only the dates of row are evaluated.

    :param row: the row that is being iterated
    :param innerrow: the inner row that is being iterated
    :return: 1 if the person was no stillborn or died at young age, else 0
    """
    # placeholder first names (German for "boy" / "girl")
    if row["firstnameGiven"] in ("Knabe", "Mädchen", "+ Mädchen"):
        return 0

    # died on the day of birth (only checked if both records carry both dates)
    if row["deathday"] and row["birthday"] and innerrow["deathday"] and innerrow["birthday"]:
        if row["deathday"] == row["birthday"] or innerrow["deathday"] == innerrow["birthday"]:
            return 0

    # died on the day of baptism (only checked if both records carry both dates)
    if row["baptismday"] and row["deathday"] and innerrow["baptismday"] and innerrow["deathday"]:
        if row["baptismday"] == row["deathday"]:
            return 0

    # checkdates returns 0 if the dates cannot be compared, which is also treated as "young"
    if row["deathday"] and row["birthday"]:
        if checkdates(row["birthday"], row["deathday"]) <= 13:
            return 0
    if row["deathday"] and row["baptismday"]:
        if checkdates(row["baptismday"], row["deathday"]) <= 13:
            return 0

    # none of the exclusion criteria applies
    return 1


def married(date1, date2):
    """
    The function checks if two wedding dates belong to the same wedding.

    :param date1: the first wedding date
    :param date2: the second wedding date
    :return: 1 if both dates are given and checkdates reports a difference of 0 years, else 0
    """
    if date1 and date2 and checkdates(date1, date2) == 0:
        return 1
    return 0


def convertintodate(year):
    """
    This function transforms the entry from the csv-file into a datetime object.

    Month abbreviations surrounded by blanks (" JAN " ... " DEC ") are replaced
    by their number, then the entry is parsed as "day.month.year".

    :param year: date (string)
    :return: datetime object; the entry as text (with replaced months) if it is
             not a single date but contains "BET"; "" if the entry is empty or
             cannot be read
    """
    if not year:
        return ""

    months = ("JAN", "FEB", "MAR", "APR", "MAY", "JUN",
              "JUL", "AUG", "SEP", "OCT", "NOV", "DEC")
    for number, month in enumerate(months, start=1):
        year = year.replace(" " + month + " ", ".{:02d}.".format(number))

    try:
        return datetime.strptime(year, "%d.%m.%Y")
    except ValueError:
        # "BET ... AND ..." dates are passed on as text
        if "BET" in year:
            return year
        return ""


def createResultTable(resulttablefile):
    """
    This function is used to erase entries from the result table
    that do not contain an entry for idSource2.

    :param resulttablefile: path of the tab-separated result table file (UTF-8)
    :return: returnlist: list that contains the remaining entries (one dictionary per line)
    """
    # newline="" leaves the handling of line breaks to the csv module
    with open(resulttablefile, encoding="utf-8-sig", newline="") as file1:
        filereader = csv.DictReader(file1, delimiter="\t")
        returnlist = [line for line in filereader if line["idSource2"] != ""]
    return returnlist


if __name__ == "__main__":
    main()