def loadData(filename, delimiter, encoding):
    """
    This function loads a delimiter-separated file that was written by the program in a previous run.
    The first row of the file is used as the header; every further row becomes one dictionary.
    :param filename: designation of the file (string)
    :param delimiter: type of delimiter (string)
    :param encoding: text encoding used to decode the file (string)
    :return: list of dictionaries with the rows of the file (empty list if the file does not exist yet)
    """
    content = []  # list of dicts
    try:
        # newline="" hands line-ending handling to the csv module, so line breaks
        # inside quoted fields are returned exactly as they were written
        with open(filename, "r", newline="", encoding=encoding) as data:
            for row in csv.DictReader(data, delimiter=delimiter):
                # the JSON round trip turns the row mapping of the reader into a plain dict
                content.append(json.loads(json.dumps(row)))
    except FileNotFoundError:
        # a missing file is expected on the very first run and is not an error
        print("Status: Inital pass for file", filename, "(no list created yet).")
    return content
def loadGedcomFile(filename):
    """
    This function loads the data of a GEDCOM file and writes them line by line into a list.
    The first line of the file (headline) is not part of the result.
    Blank lines inside the file are kept as empty strings and do not end the reading.
    As before, the list of a non-empty file is closed by one empty string.
    :param filename: name of the file inside the folder "data" (string)
    :return: in case of error "NONE", otherwise a list with the information of the GEDCOM file
    """
    # define file path
    filepath = os.path.join("data", filename)
    try:
        # the with statement closes the file even if reading fails
        with open(filepath, "r", encoding="utf-8") as gedcom:
            # remove only a real line break, so that a last line without
            # line break does not lose its final character
            lines = [line[:-1] if line.endswith("\n") else line for line in gedcom]
    except FileNotFoundError:
        print("Error: There is a problem with access to the file", filename, ".")
        return "NONE"

    # an empty file yields an empty list
    if not lines:
        return []

    # drop the headline
    preparedData = lines[1:]
    # closing empty entry: keeps the previous shape of the result, so that
    # looking at the line after the last real line stays possible
    preparedData.append("")
    return preparedData
def dictSearch(relevantDict, key, relevantObject):
    """
    This function looks for the first dictionary in a list whose value under a given key equals the searched value.
    The comparison is exact, upper and lower case are relevant.
    :param relevantDict: list of dictionaries that will be searched
    :param key: key of the dictionary to be studied
    :param relevantObject: value to be searched for under the key
    :return: position of the first matching dictionary in the list (if none is found "-1")
    """
    for position, entry in enumerate(relevantDict):
        # stop at the first hit, later duplicates are not considered
        if entry[key] == relevantObject:
            return position
    # no dictionary carries the searched value
    return -1
"in", "im") + loc = endOfString(phrase, " in ") # "loc" is needed for the upcoming function + phrase = replaceLoc(" in ", phrase, loc) + loc = endOfString(phrase, " im ") + phrase = replaceLoc(" im ", phrase, loc) + loc = endOfString(phrase, " In ") + phrase = replaceLoc(" In ", phrase, loc) + loc = endOfString(phrase, " i. ") + phrase = replaceLoc(" i. ", phrase, loc) + loc = endOfString(phrase, " von ") + phrase = replaceLoc(" von ", phrase, loc) + loc = endOfString(phrase, " v. ") + phrase = replaceLoc(" v. ", phrase, loc) + loc = endOfString(phrase, " zu ") + phrase = replaceLoc(" zu ", phrase, loc) + loc = endOfString(phrase, " auf ") + phrase = replaceLoc(" auf ", phrase, loc) + loc = endOfString(phrase, " aus ") + phrase = replaceLoc(" aus ", phrase, loc) + loc = endOfString(phrase, " Aus ") + phrase = replaceLoc(" Aus ", phrase, loc) + loc = endOfString(phrase, " an ") + phrase = replaceLoc(" an ", phrase, loc) + loc = endOfString(phrase, " der ") + phrase = replaceLoc(" der ", phrase, loc) + loc = endOfString(phrase, " des ") + phrase = replaceLoc(" des ", phrase, loc) + loc = endOfString(phrase, " van ") + phrase = replaceLoc(" van ", phrase, loc) + + # besides location information there are signal words for employers + # "loc" continues to be used here, even though the literal sense no longer fits here + loc = endOfString(phrase, " bei ", ) + phrase = replaceLoc(" bei ", phrase, loc) + loc = endOfString(phrase, " bei dem ") + phrase = replaceLoc(" bei dem ", phrase, loc) + loc = endOfString(phrase, " beim ") + phrase = replaceLoc(" beim ", phrase, loc) + loc = endOfString(phrase, " bei der ") + phrase = replaceLoc(" bei der ", phrase, loc) + + # then there are signal words in front of an occupation, which makes clear the affiliation to a dominion + affiliation = ["herrschaftlich", "herrschaftliche", "herrschaftlicher", "königlich", "königliche", "königlicher", + "fürstlich", "fürstliche", "fürstlicher"] + for i in affiliation: + if i in phrase: + # this 
information should not be deleted from the occupation statement + # it should only be stored in "loc" to be output separately afterwards + # if "loc" is empty, then no comma should precede it + if loc != "": + loc = loc + ", " + i + else: + loc = i + + # find and save years + # more detailed dates are made to year information + # assumption: Year numbers always have four digits and are at the beginning + # check if the first character is a number + if phrase[:1].isdigit() is True: + # check if the first four characters are a number + if phrase[:4].isdigit() is True: + # separate year and part behind + year = phrase[:4] + phrase = phrase[4:] + + # brackets content + if "(" in phrase and ")" in phrase: + brackets = phrase[phrase.find("("):phrase.find(")")] + phrase = phrase[:phrase.find("(")] + phrase[phrase.find(")") + 2:] # +2 because of parenthesis and space + if "[" in phrase and "]" in phrase: + brackets = phrase[phrase.find("["):phrase.find("]")] + phrase = phrase[:phrase.find("[")] + phrase[phrase.find("]") + 2:] # +2 because of parenthesis and space + + # find and save URLs + # example: <a href="https:undde.wikipedia.org/wiki/Geschichte_des_Kantons_Thurgau#Grafen_im_Thurgau">Graf im Thurgau</a> + if "<a" in phrase and "</a>" in phrase: + url = phrase[phrase.find("<a"):phrase.find("</a>")] + phrase = phrase[:phrase.find("<a")] + phrase[phrase.find("</a>"):] + + # find and save role + # wife + if "F. d." in phrase: + role = "Frau" + phrase = endOfString(phrase, "F. d.") + if "Ehefrau des" in phrase: + role = "Frau" + phrase = endOfString(phrase, "Ehefrau des") + if "Ehefrau d." in phrase: + role = "Frau" + phrase = endOfString(phrase, "Ehefrau d.") + if "Ehefrau" in phrase: + role = "Frau" + phrase = endOfString(phrase, "Ehefrau") + if "frau" in phrase and "Haus" != phrase[:4] and "Acker" != phrase[:5]: + role = "Frau" + phrase = phrase.replace("sfrau", "") + phrase = phrase.replace("frau", "") + # daugther + if "T. d." 
in phrase: + role = "Tochter" + phrase = endOfString(phrase, "T. d.") + if "tochter" in phrase: + role = "Tochter" + phrase = phrase.replace("stochter", "") + phrase = phrase.replace("tochter", "") + # son + if "S. d." in phrase: + role = "Sohn" + phrase = endOfString(phrase, "S. d.") + if "sohn" in phrase: + role = "Sohn" + phrase = phrase.replace("ssohn", "") + phrase = phrase.replace("sohn", "") + + # find and save titles + if "Prof." in phrase: + titel = "Professor" + phrase = endOfString(phrase, "Prof.") + if "Professor" in phrase: + titel = "Professor" + phrase = endOfString(phrase, "Professor") + + # step 9: temporal prepositions and numerals + if " am " in phrase: + year = endOfString(phrase, " am ") + phrase = phrase.replace(" am ", "") + phrase = phrase.replace(year, "") + if " bis " in phrase: + year = endOfString(phrase, " bis ") + phrase = phrase.replace(" bis ", "") + phrase = phrase.replace(year, "") + + # delete numbers, unless they end with a dot or there are 4 consecutive digits, then this is taken as year + numbers = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"] + numberLength = 0 + prePart = phrase + for i in range(len(phrase)): + if prePart[i:i + 1] in numbers: + numberLength = numberLength + 1 + if prePart[i + 1:i + 2] != "." 
and prePart[i + 1:i + 2] not in numbers: + if numberLength == 4: + year = prePart[i - 3:i + 1] + phrase = phrase.replace(year, "") + numberLength = 0 + else: + phrase = phrase.replace(phrase[i - numberLength + 1:i + 1], "") + numberLength = 0 + elif phrase[i + 1:i + 2] == ".": + numberLength = 0 + + # remove remaining special characters + phrase = phrase.replace(":", "") + + # remove blanks here again + # "cleanedOccupation" is what remains of the occupation specification + cleanedOccupation = phrase.strip() + + # search if there is a corresponding pedant in the already classified occupational data + occuIndex = dictSearch(existingVariantsKldB, "variant", cleanedOccupation) + # if occuIndex is not "-1", then a counterpart was found + if occuIndex != -1: + # KldB identifier + kldb = existingVariantsKldB[occuIndex]["code"] + # way of selection of a counterpart + info = "found direct" + # levDict stands in front of Levenshtein dictionary + # name not appropriate here, because no Levenshtein distance is used + # for uniformity of the variable it is used anyway + levDict = {"lemma row": occuIndex, # line of the matching dictionary + "variant": "", + "best fit lemma": existingVariantsKldB[occuIndex]["variant"], + # designation of the appropriate occupation + "absolute distance": "", + "relative distance": "", + "selection": "" + } + # if occuIndex is "-1", no counterpart was found and a similarity analysis starts + elif occuIndex == -1 and cleanedOccupation != "": # cleanedOccupation must not be empty + # similarity analysis + levDict = levenshteinDist(existingVariantsKldB, "variant", cleanedOccupation, "code") + # setting the relative Levenshtein distance of 0.25 as the essential threshold for selection + if levDict["relative distance"] < 0.25: + levDict.update({"selection": 1}) + kldb = existingVariantsKldB[levDict["lemma row"]]["code"] # take the line here from the levDict + # way of selection of a counterpart + info = "found after levenshtein" + else: + # no 
def abbreviationsCorrector(firstString, secondString):
    """
    This function compares two phrases and checks if the second one could contain an abbreviation of the first one.
    Only the first dot in "secondString" is considered.
    The word part directly in front of this dot is searched in "firstString".
    If it is found there, the rest of the word in "firstString" is replaced by a dot.
    :param firstString: first phrase without abbreviation (string)
    :param secondString: second phrase with abbreviation (string)
    :return: "firstString" with the abbreviation applied, otherwise "firstString" unchanged (string)
    """
    # continue only if there is a dot in "secondString"
    # first letters must be equal (runtime improvement)
    if "." not in secondString or secondString[:1] != firstString[:1]:
        return firstString

    positionDot = secondString.find(".")

    # look backwards from the dot for the blank that starts the abbreviated word
    # (position 0 is not examined, as in the backward count used before)
    positionBlank = secondString.rfind(" ", 1, positionDot)
    if positionBlank != -1:
        # the blank stays part of the searched text, so the hit has to start a word
        stemStart = positionBlank
        beforeDot = secondString[positionBlank:positionDot]
    else:
        # no blank: everything in front of the dot is the abbreviated part
        stemStart = min(1, positionDot)
        beforeDot = secondString[:positionDot]

    # testing minimum length, too short parts are not treated as abbreviation
    # NOTE(review): this needs 3 letters after a blank but 5 letters at the start
    # of the phrase; the thresholds are kept as they were - confirm if intended
    if positionDot - stemStart < 4:
        return firstString

    if beforeDot not in firstString:
        return firstString

    # end of the abbreviated part inside "firstString"
    positionPart = firstString.find(beforeDot) + len(beforeDot)
    # the written-out word ends at the next blank or hyphen, otherwise at the end
    positionEnd = len(firstString)
    for position in range(positionPart, len(firstString)):
        if firstString[position] in " -":
            positionEnd = position
            break

    # abbreviation found: replace the rest of the word by a dot
    # the separator that follows is kept, therefore no extra blank is inserted
    return firstString[:positionPart] + "." + firstString[positionEnd:]
+ :param existingVariantsKldB: already classified occupation KldB (list of dictionaries) + :param key: designation of the key for the "relevantDict" (string) + :param relevantObject: occupation for which a similar, already classified value is to be found (string) + :param keyRelevantDict: name of the column that contains for the identifier (string) + :return: information on similarity analysis (dictionary) + """ + # the best fitting value is to be found + # initial high values for a Levenshtein distance, which are undercut in any case + minimalDistAbs = 99999 # absolute + minimalDistRel = 99999 # relative + # binary variable, 0 if no hit was found, 1 if at least one hit was found + minOneFound = 0 + # check against each existing entry + for counter, i in enumerate(existingVariantsKldB): + # Lower case for better comparability + relevantObjectLowerCase = relevantObject.lower() + existingVariantLowerCase = copy.copy( + i[key]).lower() # copy important because it is changed afterwards + # compare only if first letters are the same (serves to improve runtime) + if existingVariantLowerCase[:1] == relevantObjectLowerCase[:1]: + # calculate Levenshtein distance + levDistAbs = Levenshtein.distance(existingVariantLowerCase, relevantObjectLowerCase) + # levDist multiply with number of blanks (+1) to avoid "gewesener königlicher Richter"/"gewesener königlicher Koch" + levDistRel = levDistAbs * (relevantObject.count(" ") + 1) / len(relevantObject) + # when the next one fits better + if levDistRel < minimalDistRel: + minimalDistAbs = levDistAbs + minimalDistRel = levDistRel + bestFitLemma = i[key] + cacheCounter = counter + # is overwritten until an equal one comes along + hitlist = [[i[key], cacheCounter]] + # if the next one fits equally well + if levDistRel == minimalDistRel: + hitlist.append([i[key], counter]) + # at least one hit + minOneFound = 1 + # no similarity + else: + continue; + + # select one in case of multiple hits + # selection is made by greatest match from 
the front (matching letters) + try: + # if there were several hits of the same quality + # anything above 0.25 is assumed to be unrealistic here, serves to improve runtime + if len(hitlist) > 1 and minimalDistRel < 0.25: + # initialization of counters + numberMatchingChars = 0 + maxNumberMatchingChars = 0 + numberMatchingCharsList = [] + for charPosition, j in enumerate(hitlist): + # if the respective letters of the strings to be compared are the same + if j[0][charPosition:charPosition + 1] == relevantObject[charPosition:charPosition + 1]: + # count up + numberMatchingChars = numberMatchingChars + 1 + # note the maximum number of characters + maxNumberMatchingChars = numberMatchingChars + # reset, if another character comes + else: + numberMatchingChars = 0 + numberMatchingCharsList.append([charPosition, maxNumberMatchingChars]) + + # Selection of the result with the closest match (no longer has anything to do with Levenshtein distance) + longestMatch = 0 + # iterate all results of the maxNumberMatchingCharsList + for j in numberMatchingCharsList: + # select so most suitable + if j[1] > longestMatch: # [1] is maxNumberMatchingChars + longestMatch = j[1] + charPosition = j[0] # [0] is charPosition + # there can be best results for the same time + # that is ignored at this point + # only one status message is issued + # the second, equally matching value, is not selected + # if j[1] == longestMatch: + # this may be due to the fact that equal values are compared + # duplicates exist in the list of already classified occupational data + # therefore values to be compared can be the same + # if hitlist[j[0]][0] == hitlist[charPosition][0]: + # print("Status: A dublette exists in the list of possible hits(" + hitlist[j[0]][0] + ", " + hitlist[charPosition][0] + ")") + # continue + # but the values do not always have to be the same, they can also just have the same beginning + # print("Status: Two very similar values exist in the list of possible hits(" + hitlist[j[0]][0] 
+ ", " + relevantObject + ")") + + # overwrite the relevant variables + bestFitLemma = hitlist[charPosition][0] + cacheCounter = hitlist[charPosition][1] + except UnboundLocalError: + pass; + + # alternative, if the possibility above did not lead to success + # this may be due to the fact that abbreviations are included + if minimalDistRel >= 0.25: + # search for abbreviations marked with a dot + for counter, i in enumerate(existingVariantsKldB): + designationCopy = relevantObject.lower() + originalDesignation = copy.copy(i[key]).lower() # copy important because it is changed afterwards + # only if first letters are equal (runtime improvement) + if originalDesignation[:1] == designationCopy[:1]: + # abbreviation handling + preDesignationCopy = designationCopy # save previous value + designationCopy = abbreviationsCorrector(designationCopy, originalDesignation) + if designationCopy == preDesignationCopy: + # the same again the other way around + originalDesignation = abbreviationsCorrector(originalDesignation, designationCopy) + levDist = Levenshtein.distance(originalDesignation, designationCopy) + if levDist < minimalDistAbs: # minimalDistRel + minimalDistAbs = levDist # minimalDistRel + # if the new value is smaller, then overwrite relevant variables + bestFitLemma = i[key] + cacheCounter = counter + # at least one hit + minOneFound = 1 + + if minOneFound == 0: + bestFitLemma = "nothing" # occurs, if e.g. the first letter is a colon; there is no variant to + cacheCounter = -1 + # merge information + levenDict = { + "lemma row": cacheCounter, + "variant": relevantObject, + "best fit lemma": bestFitLemma, + "absolute distance": minimalDistAbs, + "relative distance": minimalDistRel + } + return (levenDict) + + +def occuCleaner(occu, existingVariantsKldB): + """ + This function cleans up individual occupation information. + It is also essential that various information is separated from the original job title. 
+ This can concern several job titles, but also non-professional information. + :param occu: occupational title + :param existingVariantsKldB: already classified occupation KldB (list of dictionaries) + :return: information about the different occupational indications in the original indication (dictionary) + """ + + # storage of the original occupational title + originalOccu = occu + + # print(occu) + + # initialization + # "occu1" does not need to be initialized because there is at least one occupation specification + occu2 = {} # "" + occu3 = {} # "" + occu4 = {} # "" + occu5 = {} # "" + + # initialization + part1 = "" + part2 = "" + part3 = "" + part4 = "" + part5 = "" + + # general preprocessing + + # step 1: Remove spaces at the beginning and end + occu = occu.strip() + + # step 2: Write out abbreviations + if "mstr." in occu: + occu = occu.replace("mstr.", "meister") + if "Ing." in occu: + occu = occu.replace("Ing.", "Ingenieur") + + # step 3: Normalize separation operators + occu = separator(occu, " u.", " und") + occu = separator(occu, "+", " und ") # there are also "und" (and) without spaces + occu = separator(occu, ", ", " und ") + occu = separator(occu, ",", " und ") + occu = separator(occu, "; ", " und ") + occu = separator(occu, " & ", " und ") + occu = separator(occu, " / ", " und ") + occu = separator(occu, "/", " und ") + + # detail processing + + # separate multiple occupations + partList = [part1, part2, part3, part4, part5] # parts are still all empty here + partCounter = 0 + separation = " und " + partList[0] = occu # is needed for initialization because the while loop accesses the next one + # < 4, because not infinite parts should be made + while separation in partList[partCounter] and partCounter < 4: + st = partList[partCounter] + # exeptation: do not seperate when "-" before "und", f. e. "Kauf- und Handelsmann", or in "k. u. k." + if "- und " not in st and "k. und k." 
def statistics(occuList, occuKeys):
    """
    This function counts the number of lemmatizations over the different process branches and prints them.
    Every entry is weighted with its "number" (how often the original designation occurs).
    An entry may be a dictionary or a list whose first element is that dictionary; empty lists are skipped.
    :param occuList: list of dictionaries with information to analysed occupational information
    :param occuKeys: column headings for the analysis of separated occupations
    :return: nothing (only status output)
    """
    # one counter per known value of "selection info"
    tallies = {
        "found direct": 0,  # found directly in existing variants
        "found direct NV": 0,  # found directly in existing variants NV
        "found after levenshtein": 0,  # found by Levenshtein distance
        "found after levenshtein NV": 0,  # found by Levenshtein distance NV
        "not found": 0,  # could not be found
        "no occupational designation": 0,  # nothing left after the cleanup
    }

    for item in occuList:
        # results can arrive wrapped in a list, then the first element is the dictionary
        if isinstance(item, (list, tuple)):
            if not item:
                continue
            item = item[0]
        # iterate the possible keys ("occupation 1", ...)
        for key in occuKeys:
            occupation = item[key]
            # if the entry for the key does not contain any content, skip it
            if occupation == {}:
                continue
            info = occupation["selection info"]
            if info in tallies:
                tallies[info] += item["number"]
            else:
                print("Error: Selection information is missing.")

    counterSum = sum(tallies.values())

    def share(count):
        # proportion of the real total; 0.0 if nothing was counted at all
        return count / counterSum if counterSum else 0.0

    # output of statistical information
    direct = tallies["found direct"]
    directNV = tallies["found direct NV"]
    leven = tallies["found after levenshtein"]
    levenNV = tallies["found after levenshtein NV"]
    notFound = tallies["not found"]
    empty = tallies["no occupational designation"]
    print("Status: Proportion of adjusted occupations found directly in the variants:", share(direct), direct)
    print("Status: proportion of adjusted occupations found directly in the variants NV:", share(directNV), directNV)
    print("Status: Proportion of adjusted occupations found with Levensthein distance:", share(leven), leven)
    print("Status: Proportion of adjusted occupations found with Levensthein distance NV:", share(levenNV), levenNV)
    print("Status: Proportion of adjusted occupations not found", share(notFound), notFound)
    print("Status: Proportion of empty job titles (through cleanup)", share(empty), empty)
+ :param filename: designation of the file (string) + :param existingVariantsKldB: data on the already classified occupation information + :param fieldnamesVariants: column headings of the newVariants.csv file + :param fieldnamesDistance: column headings of the levenshteinDistance.csv + :param fieldnamesOccu: column headings of the occuResult.csv file + :param filenameVariants: path and name of the newVariants.csv file + :param filenameDistance: path and name of the levenshteinDistance.csv + :param filenameOccu: path and name of the occuResult.csv file + :param occuKeys: keys for the separated professions + :return: list with location information + """ + # a loop with one pass is necessary to be able to formulate a termination condition + for start in range(1): + # saving the name of the parallelization process + spawnPoolWorker = current_process().name + + # loading data of a GEDCOM file + data = loadGedcomFile(filename) + + # status information + print(spawnPoolWorker, "Status: The analysis of the occupational data for file", filename, "begins.") + + # list of all occupations in one source + allOccupationsInSource = [] + + # iteration of each line in the GEDCOM file + for counter, i in enumerate(data): + # continue if OCCU tag is present + if i[2:6] == "OCCU": + occupation = i[7:] + # some files have the anomaly that the OCCU tag is empty, but the profession information is in the PLAC tag below it + # if this is the case, the information of the next line should be used + if occupation == "": + occupation = data[counter + 1][7:] + allOccupationsInSource.append(occupation) + + # function must be executed iteratively, because otherwise it is called via parallelization + occuList = [] + + # avoid dublets + dubletCounterDict = {} + avoidDublettesList = [] + + for i in allOccupationsInSource: + + # if the variant has already been edited, it should not be edited again + # however, a counter should then be inplemented, which documents the number + if i in 
def createOccuList(phrase, existingVariantsKldB, filename, dubletCounterDict):
    """
    This function analyses one occupational designation of a source.
    The designation is cleaned up and then completed with the name of the source and its number of occurrences.
    :param phrase: occupational designation (string)
    :param existingVariantsKldB: data on the already classified occupation information (list)
    :param filename: designation of the file (string)
    :param dubletCounterDict: number of same occupational designations in a source (dictionary)
    :return: list with exactly one dictionary containing the occupational information
    """
    # variant cleanup
    resultOccucleaner = occuCleaner(phrase, existingVariantsKldB)
    # completing the file name
    resultOccucleaner["source"] = filename
    # take over the occurrence counted so far; fall back to 1 if the designation
    # has no counter yet or no usable counter dictionary was passed
    try:
        resultOccucleaner["number"] = dubletCounterDict[phrase]
    except (KeyError, TypeError):
        resultOccucleaner["number"] = 1
    # callers expect a list and read its first element
    return [resultOccucleaner]
def printOccuList(occuList, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu,
                  filenameVariants, filenameDistance, filenameOccu, occuKeys):
    """
    This function writes the results of the occupation analysis into the result files.
    :param occuList: information to the occupational designations (list of dictionaries)
    :param existingVariantsKldB: data on the already classified occupation information (list of dictionaries)
    :param fieldnamesVariants: column headings of the newVariants.csv file (list of strings)
    :param fieldnamesDistance: column headings of the levenshteinDistance.csv (list of strings), currently unused
    :param fieldnamesOccu: column headings of the occuResult.csv file (list of strings)
    :param filenameVariants: path and name of the newVariants.csv file (string)
    :param filenameDistance: path and name of the levenshteinDistance.csv (string), currently unused
    :param filenameOccu: path and name of the occuResult.csv file (string)
    :param occuKeys: keys for the separated professions (list of strings)
    :return: nothing! (only writing into the files)
    """
    # loading data of new variants
    # this is necessary every time, because an identical job title can occur in one of the parallel processes
    # the file is created with "utf-8-sig" and extended with "utf-8", so it must not be read as "latin1"
    # (otherwise the first column name carries the byte order mark and umlauts are destroyed)
    newVariants = loadData(filenameVariants, "\t", "utf-8-sig")

    # the lemma is looked up in a different list depending on the way of selection
    lemmaSources = {
        "found after levenshtein": existingVariantsKldB,
        "found after levenshtein NV": newVariants,
    }

    # if the selection was made on the basis of the Levenshtein distance, this information should be saved
    levenList = []  # list is used to create the content for new rows in newVariants.csv
    # NOTE: writing of levenshteinDistance.csv is disabled, therefore no rows are collected for it

    # iteration per occupation specification in the source
    for occupationInfo in occuList:
        # check all possible separated professions
        for key in occuKeys:
            part = occupationInfo[key]
            # if entry for the key is not filled in, then skip it
            if part == {}:
                continue
            lemmaSource = lemmaSources.get(part["selection info"])
            if lemmaSource is not None:
                levenList.append({
                    "variant": part["occupation"],
                    "lemma": lemmaSource[part["row of best fit lemma"]]["variant"],
                    "code": part["KldB 2010"]
                })

    # unpack dictionary information
    # all rows are assembled first, so that an error during unpacking leaves every file untouched
    unpackInfoList = []
    for occupationInfo in occuList:
        unpackInfoDict = {}
        for field, value in occupationInfo.items():
            # if it is not a dictionary, then the content is taken over like this
            if not isinstance(value, dict):
                unpackInfoDict[field] = value
                continue
            # contents of the dictionary are unpacked and written into individual fields
            for dictKey in value:
                column = str(field) + "-" + str(dictKey)
                if dictKey in ("absolute distance", "relative distance"):
                    # distances are taken from the similarity analysis
                    # (formerly this value was overwritten immediately by the generic branch)
                    unpackInfoDict[column] = value["similarity analysis"][dictKey]
                else:
                    unpackInfoDict[column] = value[dictKey]
            # delete "similarity analysis" if it is there (there is no column for the nested dictionary)
            unpackInfoDict.pop(str(field) + "-similarity analysis", None)
        unpackInfoList.append(unpackInfoDict)

    # blocked printing of new lines in the files
    appendFile(filenameOccu, unpackInfoList, fieldnamesOccu)
    appendFile(filenameVariants, levenList, fieldnamesVariants)
def collectNotFound(results, occuKeys):
    """
    This function collects the designations of all separated occupations that are marked as "not found".
    :param results: results of a processing run (list of dictionaries or list of lists of dictionaries)
    :param occuKeys: keys for the separated professions (list of strings)
    :return: list of occupational designations (strings) that could not be assigned
    """
    notFound = []
    for entry in results:
        # the GEDCOM run delivers one list of results per file, the other runs deliver the dictionaries directly
        # every element of such a list has to be examined, not only the first one
        records = entry if isinstance(entry, list) else [entry]
        for record in records:
            # iterate the possible keys ("occupation 1", ...)
            for key in occuKeys:
                part = record[key]
                # if the entry for the key does not contain any content, skip it
                if part == {}:
                    continue
                # only professions that are "not found"
                if part["selection info"] == "not found":
                    notFound.append(part["occupation"])
    return notFound


def runFollowUpPass(designations, existingVariantsKldB, dubletCounterDict, sourceLabel):
    """
    This function processes a list of occupational designations in parallel with createOccuList.
    :param designations: occupational designations to be processed (list of strings)
    :param existingVariantsKldB: data on the already classified occupation information (list of dictionaries)
    :param dubletCounterDict: number of same occupational designations in a source (dictionary)
    :param sourceLabel: value that is stored as "source" of the results (string)
    :return: list of dictionaries with the results of the analysis
    """
    pool = Pool(1)  # number of cores used is variable
    try:
        nestedResults = pool.map(partial(createOccuList,
                                         existingVariantsKldB=existingVariantsKldB,
                                         dubletCounterDict=dubletCounterDict,
                                         filename=sourceLabel), designations)
    finally:
        # the worker process is released even if the processing fails
        pool.close()
        pool.join()
    # unpack list (createOccuList returns a list with one entry)
    return [oneOccu[0] for oneOccu in nestedResults]


if __name__ == '__main__':
    # part up to 'parallelization' is executed once at the beginning

    inputDataType = "ged"  # data type in which the input data is available, "ged" and "csv" are possible

    # storage of the time at the beginning of the program run
    starttime = time.perf_counter()

    if inputDataType == "ged":
        # loading the sources (exemplary here: GEDCOM files from GEDBAS)
        # definition of the range in which the file names are located (e.g. 0.ged to 59999.ged)
        begin = 0
        end = 60000
        # creation of a list with the file names that really exist
        # files are located in the 'data' subfolder
        gedcomNamesListClear = []
        for number in range(begin, end):
            datename = str(number) + ".ged"
            if os.path.isfile(os.path.join("data", datename)):
                gedcomNamesListClear.append(datename)

    # open more context data
    # data from the Historical Data Center of Saxony-Anhalt
    # classification based on the Klassifikation der Berufe (KldB, Classification of Professions)
    # data from another classification system can also be used here
    # file contains already classified occupational variants
    filename = os.path.join("data", "variants.csv")
    fieldnames = ["idVariant",  # unique ID of the occupational variant
                  "variant",  # textual representation of the variant
                  "code"  # code of the OhdAB
                  ]
    # loading data from existing file
    # if no file exists, a new one is created
    existingVariantsKldB = createFile(filename, fieldnames, ";", "latin1")

    # status message on the number of existing variants
    print("Status:", len(existingVariantsKldB), "classified variants already exist.")

    # if halving of variants is to be done for testing purposes, set halving to "yes"
    halving = "yes"
    # deletion of every second already classified occupation information
    if halving == "yes":
        remainingVariantsKldB = existingVariantsKldB[::2]
        print("Status: There has been a halving of the variants for testing purposes.", len(remainingVariantsKldB),
              "variants remain.")
        # overwrite the variable of all variants
        existingVariantsKldB = remainingVariantsKldB

    # create file for saving the newly classified files
    filenameVariants = os.path.join("data", "newVariants.csv")
    fieldnamesVariants = ["variant",  # designation of the new variant of an occupation
                          "lemma",  # existing designation of an occupation to which the new variant is assigned
                          "code"  # code according to KldB
                          ]
    createFile(filenameVariants, fieldnamesVariants, "\t", "latin1")

    # list about the best hits for each checked job title
    # NOTE: the file itself is currently not created, the values are only handed over to the functions
    filenameDistance = "levenshteinDistance.csv"
    fieldnamesDistance = ["relative distance",  # absolute Levenshtein distance divided by the length of the variant
                          "absolute distance",  # absolute Levenshtein distance
                          "variant",  # designation of the new variant of an occupation
                          "best fit lemma",  # designation of the best fitting existing variant
                          "selection",  # binary information whether the lemma was selected (1 means yes, 0 means no)
                          "lemma row"  # number of the line in the existing variants
                          ]

    # definition of the keys for the separated professions
    occuKeys = ["occupation 1", "occupation 2", "occupation 3", "occupation 4", "occupation 5"]

    # list for dividing the different components of a job specification
    filenameOccu = "occuResult.csv"
    # information that is stored for each of the separated professions
    perOccupationFields = ["occupation", "KldB 2010", "best fit lemma", "row of best fit lemma", "titel", "role",
                           "year", "url", "location", "further info", "selection info", "lemma row",
                           "absolute distance", "relative distance"]
    fieldnamesOccu = ["variant",  # designation of the new variant of an occupation
                      "source",  # name of the file in which the variant occurs (source)
                      "number"  # number of occurrences of the variant in the source
                      ]
    for occuKey in occuKeys:
        for field in perOccupationFields:
            fieldnamesOccu.append(occuKey + "-" + field)
            # the header of the existing result file contains this additional column only for the second occupation
            # it is kept so that the column layout of the file does not change
            if occuKey == "occupation 2" and field == "selection info":
                fieldnamesOccu.append(occuKey + "-similarity analysis")
    createFile(filenameOccu, fieldnamesOccu, "\t", "latin1")

    # initialization of a list in which the results of the upcoming process are stored
    # the result is a list of dictionaries containing different information about the analysis (occuList)
    occuList = []

    # number of occurrences per occupational designation (only filled for csv input)
    dubletCounterDict = {}

    if inputDataType == "csv":
        occupationsList = loadData("occupations.csv", ";", "utf-8-sig")
        # count duplicate occupational designations
        for row in occupationsList:
            occupation = row["occupation"]
            dubletCounterDict[occupation] = dubletCounterDict.get(occupation, 0) + 1

        print("Status: Using a csv file with", len(occupationsList), "occupations")
        print("Status: File contains", len(dubletCounterDict), "different occupational titles")

        # every designation is processed only once, in the order of its first occurrence
        for occupation in dubletCounterDict:
            occuList.append(
                createOccuList(occupation, existingVariantsKldB, "occupations.csv", dubletCounterDict)[0])

        printOccuList(occuList, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu,
                      filenameVariants, filenameDistance, filenameOccu, occuKeys)

    elif inputDataType == "ged":
        # parallelization: one GEDCOM file per task
        pool = Pool(1)  # number of cores used is variable
        try:
            occuList = pool.map(partial(preCreateOccuList,
                                        existingVariantsKldB=existingVariantsKldB,
                                        fieldnamesVariants=fieldnamesVariants,
                                        fieldnamesDistance=fieldnamesDistance,
                                        fieldnamesOccu=fieldnamesOccu,
                                        filenameVariants=filenameVariants,
                                        filenameDistance=filenameDistance,
                                        filenameOccu=filenameOccu,
                                        occuKeys=occuKeys), gedcomNamesListClear)
        finally:
            pool.close()
            pool.join()

    else:
        print("Error: No valide inputDataType")

    # second processing loop for the designations that are not found but have components
    # Example: "farmer and craftsman" is not found, but "farmer" and "craftsman" are found individually
    print("Status: Second processing started")
    gedcomNamesListClear2 = collectNotFound(occuList, occuKeys)
    occuList2 = runFollowUpPass(gedcomNamesListClear2, existingVariantsKldB, dubletCounterDict, "second try")
    printOccuList(occuList2, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu,
                  filenameVariants, filenameDistance, filenameOccu, occuKeys)

    # the same again for a third iteration
    print("Status: Third processing started")
    gedcomNamesListClear3 = collectNotFound(occuList2, occuKeys)
    occuList3 = runFollowUpPass(gedcomNamesListClear3, existingVariantsKldB, dubletCounterDict, "third try")
    printOccuList(occuList3, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu,
                  filenameVariants, filenameDistance, filenameOccu, occuKeys)

    # creation of statistics for the three iterations
    statistics(occuList, occuKeys)
    statistics(occuList2, occuKeys)
    statistics(occuList3, occuKeys)

    # storage of the time at the ending of the program run
    finishtime = time.perf_counter()

    # status info
    print("Status: Program finished in", round(finishtime - starttime, 2), "seconds(s)")
# See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings
# The InsecureRequestWarning is suppressed because the requests below are sent without certificate verification.
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Scraper based on introduction on https://towardsdatascience.com/how-to-web-scrape-with-python-in-4-minutes-bc49186a8460
# published from Julia Kho on 27 September 2019, last open on 2 July 2021

# import of libraries
import requests
from bs4 import BeautifulSoup
import io


def main():
    """
    This function downloads the publicly accessible GEDCOM files of GEDBAS.
    Possible GEDCOM files are determined by varying the number at the end of the export URL.
    Each file found is written to "<number>.ged" in the current working directory.
    :return: nothing! (only writing of files and printing of status information)
    """
    # initialisation of counters
    number = 0
    found = 0
    notAllowed = 0
    notFound = 0
    notFoundSeries = 0
    end = 10000
    empty = 0

    # execution of the iteration until no result is found 10000 times in a row (value of variable "end")
    while notFoundSeries != end:
        # definition of the URL
        url = "https://gedbas.genealogy.net/gedcom/export/" + str(number)
        try:
            # scraping the information
            # NOTE(review): verify=False disables the check of the TLS certificate
            response = requests.get(url, verify=False)
            gedcom = str(BeautifulSoup(response.text, "html.parser"))
            # analysis of the information
            if gedcom == "not found":
                # the file is not (no longer) available
                notFound = notFound + 1
                notFoundSeries = notFoundSeries + 1
            elif gedcom == "not allowed":
                # no public access to the file is possible
                notAllowed = notAllowed + 1
                notFoundSeries = 0
            elif gedcom == "":
                # empty gedcom file
                empty = empty + 1
                notFoundSeries = 0
            else:
                # in all other cases the information is written to a GEDCOM file
                filename = str(number) + ".ged"
                with io.open(filename, "w", encoding="utf-8") as file:
                    file.write(gedcom)
                # count number of detected files
                found = found + 1
                # resetting the counter that counts the number of unsuccessful calls in series
                notFoundSeries = 0
        except Exception as error:
            # KeyboardInterrupt is deliberately not caught, so that the long run can be aborted
            print("Status: There is an error in file " + str(number) + ".", repr(error))
        # gives info every 1000 urls
        if number % 1000 == 0:
            print("Status:", str(number) + " urls were analyzed")
        # count up per analysed URL
        number = number + 1

    # printing status information
    print("Status: Scraping finished")
    print("Status: " + str(found) + " files could be found")
    print("Status: Access was denied for " + str(notAllowed) + " files")
    # the last "end" unsuccessful calls lie behind the highest existing number and are therefore subtracted
    print("Status: " + str(notFound - end) + " files were deleted")
    print("Status: " + str(empty) + " files were blank")


if __name__ == "__main__":
    main()
def loadData(filename, delimiter, encoding):
    """
    This function is used to open files in which data is temporarily stored and was created by the program in a previous run.
    :param filename: designation of the file (string)
    :param delimiter: type of delimiter (string)
    :param encoding: encoding with which the file is read (string)
    :return: list of dictionaries with information of the file to be loaded (empty if the file does not exist)
    """
    content = []  # list of dicts
    try:
        # newline='' leaves the handling of line breaks to the csv module (as recommended for csv files)
        with open(filename, "r", newline='', encoding=encoding) as data:
            for row in csv.DictReader(data, delimiter=delimiter):
                content.append(dict(row))
    except FileNotFoundError:
        print("Status: Inital pass for file", filename, "(no list created yet).")
    return content


def appendFile(filename, data, fieldnames):
    """
    This function appends rows to a tab-separated CSV file.
    :param filename: designation of the file (string)
    :param data: one row (dictionary) or several rows (list of dictionaries)
    :param fieldnames: column names of the file to be written (list of strings)
    """
    with open(filename, "a", newline='', encoding="utf-8") as opener:
        writer = csv.DictWriter(opener, fieldnames=fieldnames, delimiter="\t")
        # differentiation of cases where one or more lines are to be added
        # a single dictionary is written as one line (e.g. for the file "qualityofgedcom.csv")
        # in all other cases several lines are written
        if isinstance(data, dict):
            writer.writerow(data)
        else:
            writer.writerows(data)


def createFile(filename, fieldnames, delimiter, encoding):
    """
    This function creates a new file if no file with content already exists under this name.
    The function is also used to load data when it is clear that the file already exists.
    :param filename: designation of the file (string)
    :param fieldnames: column names of the file to be written (list of strings)
    :param delimiter: type of delimiter (string)
    :param encoding: encoding with which an existing file is read (string); a new file is always written as UTF-8
    :return: list of dictionaries with information of the file to be loaded
    """
    content = loadData(filename, delimiter, encoding)
    # create a new file with the column names if no data could be loaded
    if not content:
        with open(filename, "w", newline='', encoding="utf-8") as opener:
            csv.writer(opener, delimiter=delimiter).writerow(fieldnames)
    return content


def loadGedcomFile(filename):
    """
    This function loads the data of a GEDCOM file from the 'data' subfolder and writes them line by line into a list.
    The first line of the file (headline) is not part of the list.
    The list is terminated by an empty string, so that the line after the last one can be accessed safely.
    :param filename: name of the file
    :return: in case of a missing file "NONE", otherwise a list with the lines of the GEDCOM file
    """
    # define file path
    filepath = os.path.join("data", filename)
    try:
        with open(filepath, "r", encoding="utf-8") as gedcom:
            # only the line break is removed, so a last line without line break stays complete
            # blank lines inside the file do not end the reading
            lines = [line.rstrip("\n") for line in gedcom]
    except FileNotFoundError:
        print("Error: There is a problem with access to the file", filename, ".")
        return "NONE"
    if not lines:
        return []
    return lines[1:] + [""]
def separator(occu, replaced, replacer):
    """
    This function is used to replace separation operators.
    :param occu: string that is processed
    :param replaced: content to be replaced
    :param replacer: content that takes the place of the one to be replaced
    :return: new string with changed content (unchanged if "replaced" does not occur)
    """
    return occu.replace(replaced, replacer)


def endOfString(phrase, signalWord):
    """
    This function returns the part of a string that follows a signal word.
    The end position of the first occurrence of the signal word is determined, if it exists.
    Everything up to and including the signal word is removed.
    :param phrase: string to be searched (string)
    :param signalWord: displays a place name (string)
    :return: text after the signal word in the phrase, "" if the signal word does not occur
    """
    position = phrase.find(signalWord)
    if position == -1:
        return ""
    return phrase[position + len(signalWord):]


def replaceLoc(signalWord, phrase, loc):
    """
    This function removes a signal word and the associated location name from an occupation phrase.
    :param signalWord: displays a place name (string)
    :param phrase: string to be searched (string)
    :param loc: designation of a place (string)
    :return: adjusted occupation phrase (unchanged if the signal word does not occur)
    """
    if signalWord in phrase:
        # every occurrence of the signal word and afterwards every occurrence of the location is removed
        # NOTE(review): a location that also occurs in front of the signal word is removed there as well
        phrase = phrase.replace(signalWord, "").replace(loc, "")
    return phrase


def dictSearch(relevantDict, key, relevantObject):
    """
    This function searches a given list of dictionaries for a searched value and returns the position of the first hit.
    :param relevantDict: list of dictionaries that will be searched
    :param key: key of the dictionary to be studied
    :param relevantObject: name of the value to be searched for under the key in the Dictionary
    :return: number of the searched dictionary in the list (if none is found "-1")
    """
    # note: upper and lower case is relevant here
    return next((index for index, d in enumerate(relevantDict) if d[key] == relevantObject), -1)
"in", "im") + loc = endOfString(phrase, " in ") # "loc" is needed for the upcoming function + phrase = replaceLoc(" in ", phrase, loc) + loc = endOfString(phrase, " im ") + phrase = replaceLoc(" im ", phrase, loc) + loc = endOfString(phrase, " In ") + phrase = replaceLoc(" In ", phrase, loc) + loc = endOfString(phrase, " i. ") + phrase = replaceLoc(" i. ", phrase, loc) + loc = endOfString(phrase, " von ") + phrase = replaceLoc(" von ", phrase, loc) + loc = endOfString(phrase, " v. ") + phrase = replaceLoc(" v. ", phrase, loc) + loc = endOfString(phrase, " zu ") + phrase = replaceLoc(" zu ", phrase, loc) + loc = endOfString(phrase, " auf ") + phrase = replaceLoc(" auf ", phrase, loc) + loc = endOfString(phrase, " aus ") + phrase = replaceLoc(" aus ", phrase, loc) + loc = endOfString(phrase, " Aus ") + phrase = replaceLoc(" Aus ", phrase, loc) + loc = endOfString(phrase, " an ") + phrase = replaceLoc(" an ", phrase, loc) + loc = endOfString(phrase, " der ") + phrase = replaceLoc(" der ", phrase, loc) + loc = endOfString(phrase, " des ") + phrase = replaceLoc(" des ", phrase, loc) + loc = endOfString(phrase, " van ") + phrase = replaceLoc(" van ", phrase, loc) + + # besides location information there are signal words for employers + # "loc" continues to be used here, even though the literal sense no longer fits here + loc = endOfString(phrase, " bei ", ) + phrase = replaceLoc(" bei ", phrase, loc) + loc = endOfString(phrase, " bei dem ") + phrase = replaceLoc(" bei dem ", phrase, loc) + loc = endOfString(phrase, " beim ") + phrase = replaceLoc(" beim ", phrase, loc) + loc = endOfString(phrase, " bei der ") + phrase = replaceLoc(" bei der ", phrase, loc) + + # then there are signal words in front of an occupation, which makes clear the affiliation to a dominion + affiliation = ["herrschaftlich", "herrschaftliche", "herrschaftlicher", "königlich", "königliche", "königlicher", + "fürstlich", "fürstliche", "fürstlicher"] + for i in affiliation: + if i in phrase: + # this 
information should not be deleted from the occupation statement + # it should only be stored in "loc" to be output separately afterwards + # if "loc" is empty, then no comma should precede it + if loc != "": + loc = loc + ", " + i + else: + loc = i + + # find and save years + # more detailed dates are made to year information + # assumption: Year numbers always have four digits and are at the beginning + # check if the first character is a number + if phrase[:1].isdigit() is True: + # check if the first four characters are a number + if phrase[:4].isdigit() is True: + # separate year and part behind + year = phrase[:4] + phrase = phrase[4:] + + # brackets content + if "(" in phrase and ")" in phrase: + brackets = phrase[phrase.find("("):phrase.find(")")] + phrase = phrase[:phrase.find("(")] + phrase[phrase.find(")") + 2:] # +2 because of parenthesis and space + if "[" in phrase and "]" in phrase: + brackets = phrase[phrase.find("["):phrase.find("]")] + phrase = phrase[:phrase.find("[")] + phrase[phrase.find("]") + 2:] # +2 because of parenthesis and space + + # find and save URLs + # example: <a href="https:undde.wikipedia.org/wiki/Geschichte_des_Kantons_Thurgau#Grafen_im_Thurgau">Graf im Thurgau</a> + if "<a" in phrase and "</a>" in phrase: + url = phrase[phrase.find("<a"):phrase.find("</a>")] + phrase = phrase[:phrase.find("<a")] + phrase[phrase.find("</a>"):] + + # find and save role + # wife + if "F. d." in phrase: + role = "Frau" + phrase = endOfString(phrase, "F. d.") + if "Ehefrau des" in phrase: + role = "Frau" + phrase = endOfString(phrase, "Ehefrau des") + if "Ehefrau d." in phrase: + role = "Frau" + phrase = endOfString(phrase, "Ehefrau d.") + if "Ehefrau" in phrase: + role = "Frau" + phrase = endOfString(phrase, "Ehefrau") + if "frau" in phrase and "Haus" != phrase[:4] and "Acker" != phrase[:5]: + role = "Frau" + phrase = phrase.replace("sfrau", "") + phrase = phrase.replace("frau", "") + # daugther + if "T. d." 
in phrase: + role = "Tochter" + phrase = endOfString(phrase, "T. d.") + if "tochter" in phrase: + role = "Tochter" + phrase = phrase.replace("stochter", "") + phrase = phrase.replace("tochter", "") + # son + if "S. d." in phrase: + role = "Sohn" + phrase = endOfString(phrase, "S. d.") + if "sohn" in phrase: + role = "Sohn" + phrase = phrase.replace("ssohn", "") + phrase = phrase.replace("sohn", "") + + # find and save titles + if "Prof." in phrase: + titel = "Professor" + phrase = endOfString(phrase, "Prof.") + if "Professor" in phrase: + titel = "Professor" + phrase = endOfString(phrase, "Professor") + + # step 9: temporal prepositions and numerals + if " am " in phrase: + year = endOfString(phrase, " am ") + phrase = phrase.replace(" am ", "") + phrase = phrase.replace(year, "") + if " bis " in phrase: + year = endOfString(phrase, " bis ") + phrase = phrase.replace(" bis ", "") + phrase = phrase.replace(year, "") + + # delete numbers, unless they end with a dot or there are 4 consecutive digits, then this is taken as year + numbers = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"] + numberLength = 0 + prePart = phrase + for i in range(len(phrase)): + if prePart[i:i + 1] in numbers: + numberLength = numberLength + 1 + if prePart[i + 1:i + 2] != "." 
and prePart[i + 1:i + 2] not in numbers: + if numberLength == 4: + year = prePart[i - 3:i + 1] + phrase = phrase.replace(year, "") + numberLength = 0 + else: + phrase = phrase.replace(phrase[i - numberLength + 1:i + 1], "") + numberLength = 0 + elif phrase[i + 1:i + 2] == ".": + numberLength = 0 + + # remove remaining special characters + phrase = phrase.replace(":", "") + + # remove blanks here again + # "cleanedOccupation" is what remains of the occupation specification + cleanedOccupation = phrase.strip() + + # search if there is a corresponding pedant in the already classified occupational data + occuIndex = dictSearch(existingVariantsKldB, "Variante", cleanedOccupation) + # if occuIndex is not "-1", then a counterpart was found + if occuIndex != -1: + # KldB identifier + kldb = existingVariantsKldB[occuIndex]["OhdAB_01"] + # way of selection of a counterpart + info = "found direct" + # levDict stands in front of Levenshtein dictionary + # name not appropriate here, because no Levenshtein distance is used + # for uniformity of the variable it is used anyway + levDict = {"lemma row": occuIndex, # line of the matching dictionary + "variant": "", + "best fit lemma": existingVariantsKldB[occuIndex]["Variante"], + # designation of the appropriate occupation + "absolute distance": "", + "relative distance": "", + "selection": "" + } + # if occuIndex is "-1", no counterpart was found and a similarity analysis starts + elif occuIndex == -1 and cleanedOccupation != "": # cleanedOccupation must not be empty + # similarity analysis + levDict = levenshteinDist(existingVariantsKldB, "Variante", cleanedOccupation, "OhdAB_01") + # setting the relative Levenshtein distance of 0.25 as the essential threshold for selection + if levDict["relative distance"] < 0.25: + levDict.update({"selection": 1}) + kldb = existingVariantsKldB[levDict["lemma row"]]["OhdAB_01"] # take the line here from the levDict + # way of selection of a counterpart + info = "found after levenshtein" + else: 
def abbreviationsCorrector(firstString, secondString):
    """
    This function compares two phrases and checks if one of them could be an abbreviation of the other.
    If "secondString" contains a word that is abbreviated with a dot and the abbreviated part also occurs
    in "firstString", the corresponding word in "firstString" is truncated in the same way.
    Example: ("alter kaiserlicher richter", "alt kaiserl. richter") -> "alter kaiserl.  richter"
    :param firstString: first phrase without abbreviation (string)
    :param secondString: second phrase with abbreviation (string)
    :return: "firstString" with the matching word abbreviated, otherwise the unchanged "firstString" (string)
    """
    # continue only if there is a dot in "secondString"
    # equal first letters are required as a cheap pre-filter (runtime improvement)
    if "." not in secondString or secondString[:1] != firstString[:1]:
        return firstString

    # only the first dot is considered
    positionDot = secondString.find(".")

    # find the last blank in front of the dot (index 0 is not inspected)
    # the previous implementation tested the character at "positionDot", which is always the dot itself,
    # so a blank was never found and the complete text before the dot was used as abbreviated part
    positionBlank = secondString.rfind(" ", 1, positionDot)

    if positionBlank != -1:
        # minimum length: at least three letters between the blank and the dot
        if positionDot - positionBlank < 4:
            return firstString
        # the leading blank is kept so that the part only matches at the beginning of a word
        beforeDot = secondString[positionBlank:positionDot]
    else:
        # no blank in front of the dot: everything before the dot is the abbreviated part
        # the threshold of the previous implementation is kept (dot not before index 5)
        if positionDot < 5:
            return firstString
        beforeDot = secondString[:positionDot]

    if beforeDot not in firstString:
        return firstString

    # position directly behind the abbreviated part in "firstString"
    positionPart = firstString.find(beforeDot) + len(beforeDot)
    # the truncated word ends at the next blank or hyphen, otherwise at the end of the string
    positionEnd = len(firstString)
    for position in range(positionPart, len(firstString)):
        if firstString[position] in (" ", "-"):
            positionEnd = position
            break

    # abbreviation found, abbreviate original name
    # NOTE(review): the remainder still starts with its blank/hyphen, so ". " yields a double blank
    # (or ". -"); kept unchanged here, confirm whether "." alone was intended
    return firstString[:positionPart] + ". " + firstString[positionEnd:]
+ :param existingVariantsKldB: already classified occupation KldB (list of dictionaries) + :param key: designation of the key for the "relevantDict" (string) + :param relevantObject: occupation for which a similar, already classified value is to be found (string) + :param keyRelevantDict: # vame of the column that contains for the identifier (string) + :return: information on similarity analysis (dictionary) + """ + # the best fitting value is to be found + # initial high values for a Levenshtein distance, which are undercut in any case + minimalDistAbs = 99999 # absolute + minimalDistRel = 99999 # relative + # binary variable, 0 if no hit was found, 1 if at least one hit was found + minOneFound = 0 + # check against each existing entry + for counter, i in enumerate(existingVariantsKldB): + # Lower case for better comparability + relevantObjectLowerCase = relevantObject.lower() + existingVariantLowerCase = copy.copy( + i[keyRelevantDict]).lower() # copy important because it is changed afterwards + # compare only if first letters are the same (serves to improve runtime) + if existingVariantLowerCase[:1] == relevantObjectLowerCase[:1]: + # calculate Levenshtein distance + levDistAbs = Levenshtein.distance(existingVariantLowerCase, relevantObjectLowerCase) + # levDist multiply with number of blanks (+1) to avoid "gewesener königlicher Richter"/"gewesener königlicher Koch" + levDistRel = levDistAbs * (relevantObject.count(" ") + 1) / len(relevantObject) + # when the next one fits better + if levDistRel < minimalDistRel: + minimalDistAbs = levDistAbs + minimalDistRel = levDistRel + bestFitLemma = i[keyRelevantDict] + cacheCounter = counter + # is overwritten until an equal one comes along + hitlist = [[i[keyRelevantDict], cacheCounter]] + # if the next one fits equally well + if levDistRel == minimalDistRel: + hitlist.append([i[keyRelevantDict], counter]) + # at least one hit + minOneFound = 1 + # no similarity + else: + continue; + + # select one in case of multiple 
hits + # selection is made by greatest match from the front (matching letters) + try: + # if there were several hits of the same quality + # anything above 0.25 is assumed to be unrealistic here, serves to improve runtime + if len(hitlist) > 1 and minimalDistRel < 0.25: + # initialization of counters + numberMatchingChars = 0 + maxNumberMatchingChars = 0 + numberMatchingCharsList = [] + for charPosition, j in enumerate(hitlist): + # if the respective letters of the strings to be compared are the same + if j[0][charPosition:charPosition + 1] == relevantObject[charPosition:charPosition + 1]: + # count up + numberMatchingChars = numberMatchingChars + 1 + # note the maximum number of characters + maxNumberMatchingChars = numberMatchingChars + # reset, if another character comes + else: + numberMatchingChars = 0 + numberMatchingCharsList.append([charPosition, maxNumberMatchingChars]) + + # Selection of the result with the closest match (no longer has anything to do with Levenshtein distance) + longestMatch = 0 + # iterate all results of the maxNumberMatchingCharsList + for j in numberMatchingCharsList: + # select so most suitable + if j[1] > longestMatch: # [1] is maxNumberMatchingChars + longestMatch = j[1] + charPosition = j[0] # [0] is charPosition + # there can be best results for the same time + # that is ignored at this point + # only one status message is issued + # the second, equally matching value, is not selected + if j[1] == longestMatch: + # this may be due to the fact that equal values are compared + # duplicates exist in the list of already classified occupational data + # therefore values to be compared can be the same + if hitlist[j[0]][0] == hitlist[charPosition][0]: + print("Status: A duplicate exists in the list:", hitlist[j[0]][0], hitlist[charPosition][0]) + continue; + # but the values do not always have to be the same, they can also just have the same beginning + print("Status: Two very similar values exist in the list:", hitlist[j[0]], 
longestMatch, + relevantObject) + + # overwrite the relevant variables + bestFitLemma = hitlist[charPosition][0] + cacheCounter = hitlist[charPosition][1] + except UnboundLocalError: + pass; + + # alternative, if the possibility above did not lead to success + # this may be due to the fact that abbreviations are included + if minimalDistRel >= 0.25: + # search for abbreviations marked with a dot + for counter, i in enumerate(existingVariantsKldB): + designationCopy = relevantObject.lower() + originalDesignation = copy.copy(i[key]).lower() # copy important because it is changed afterwards + # only if first letters are equal (runtime improvement) + if originalDesignation[:1] == designationCopy[:1]: + # abbreviation handling + preDesignationCopy = designationCopy # save previous value + designationCopy = abbreviationsCorrector(designationCopy, originalDesignation) + if designationCopy == preDesignationCopy: + # the same again the other way around + originalDesignation = abbreviationsCorrector(originalDesignation, designationCopy) + levDist = Levenshtein.distance(originalDesignation, designationCopy) + if levDist < minimalDistRel: + minimalDistRel = levDist + # if the new value is smaller, then overwrite relevant variables + bestFitLemma = i[key] + cacheCounter = counter + # at least one hit + minOneFound = 1 + + if minOneFound == 0: + bestFitLemma = "nothing" # occurs, if e.g. the first letter is a colon; there is no variant to + cacheCounter = -1 + # merge information + levenDict = { + "lemma row": cacheCounter, + "variant": relevantObject, + "best fit lemma": bestFitLemma, + "absolute distance": minimalDistAbs, + "relative distance": minimalDistRel + } + return (levenDict) + + +def occucleaner(occu, existingVariantsKldB): + """ + This function cleans up individual occupation information. + It is also essential that various information is separated from the original job title. + This can concern several job titles, but also non-professional information. 
+ :param occu: occupational title + :param existingVariantsKldB: already classified occupation KldB (list of dictionaries) + :return: information about the different occupational indications in the original indication (dictionary) + """ + + # storage of the original occupational title + originalOccu = occu + + # Initialisierung + # "occu1" does not need to be initialized because there is at least one occupation specification + occu2 = "" + occu3 = "" + occu4 = "" + occu5 = "" + + # Initialisierung + part1 = "" + part2 = "" + part3 = "" + part4 = "" + part5 = "" + + # general preprocessing + + # step 1: Remove spaces at the beginning and end + occu = occu.strip() + + # step 2: Write out abbreviations + if "mstr." in occu: + occu = occu.replace("mstr.", "meister") + if "Ing." in occu: + occu = occu.replace("Ing.", "Ingenieur") + + # step 3: Normalize separation operators + occu = separator(occu, " u.", " und") + occu = separator(occu, "+", " und ") # there are also "und" (and) without spaces + occu = separator(occu, ", ", " und ") + occu = separator(occu, ",", " und ") + occu = separator(occu, "; ", " und ") + occu = separator(occu, " & ", " und ") + occu = separator(occu, " / ", " und ") + occu = separator(occu, "/", " und ") + + # detail processing + + # separate multiple occupations + partList = [part1, part2, part3, part4, part5] # parts are still all empty here + partCounter = 0 + trennoperator = " und " + partList[0] = occu # is needed for initialization because the while loop accesses the next one + # < 4, because not infinite parts should be made + while trennoperator in partList[partCounter] and partCounter < 4: + st = partList[partCounter] + partList[partCounter] = st[:st.find(" und ")] + partList[partCounter + 1] = st[(st.find(" und ") + len(" und ")):] + partCounter = partCounter + 1 + + # Werte aus Party zurückschreiben + part1 = partList[0] + part2 = partList[1] + part3 = partList[2] + part4 = partList[3] + part5 = partList[4] + + if partCounter == 0: # 
def statistics(occuList, occuKeys):
    """
    This function counts the number of lemmatizations over the different process branches
    and prints the share and the absolute number for each branch.
    :param occuList: list of lists of dictionaries with information to analysed occupational information;
                     each dictionary is read at the key "number" and at the keys given in "occuKeys"
    :param occuKeys: column headings for the analysis of separated occupations (list of strings)
    """
    # one counter per process branch; the key is the value of "selection info"
    counters = {
        "found direct": 0,  # found directly in existing variants
        "found direct NV": 0,  # found directly in existing variants NV
        "found after levenshtein": 0,  # found by Levenshtein distance
        "found after levenshtein NV": 0,  # found by Levenshtein distance NV
        "not found": 0,  # could not be found
        "no occupational designation": 0  # empty occupational designations (result of the cleanup)
    }

    for i in occuList:
        # iterate all the occupations stored in it
        for j in i:
            # iterate the possible keys ("occupation 1", ...)
            for key in occuKeys:
                # if the entry for the key does not contain any content, skip it
                if j[key] == "":
                    continue
                info = j[key]["selection info"]
                if info in counters:
                    # every entry is weighted with its number of occurrences
                    counters[info] = counters[info] + j["number"]
                else:
                    print("Error: Selection information is missing.")

    counterSum = sum(counters.values())

    def share(count):
        # if nothing was counted (e.g. an empty result list), no share can be computed
        # 0.0 is reported then instead of raising a ZeroDivisionError
        if counterSum == 0:
            return 0.0
        return count / counterSum

    # output of statistical information
    print("Status: Proportion of adjusted occupations found directly in the variants:",
          share(counters["found direct"]), counters["found direct"])
    print("Status: proportion of adjusted occupations found directly in the variants NV:",
          share(counters["found direct NV"]), counters["found direct NV"])
    print("Share ... Levensthein distance:",
          share(counters["found after levenshtein"]), counters["found after levenshtein"])
    print("Share ... Levensthein distance NV:",
          share(counters["found after levenshtein NV"]), counters["found after levenshtein NV"])
    print("Share ... not found", share(counters["not found"]), counters["not found"])
    print("Share of empty job titles (through cleanup)",
          share(counters["no occupational designation"]), counters["no occupational designation"])
+ :param filename: designation of the file (string) + :param existingVariantsKldB: data on the already classified occupation information + :param fieldnamesVariants: column headings of the newVariants.csv file + :param fieldnamesDistance: column headings of the levenshteinDistance.csv + :param fieldnamesOccu: column headings of the occuResult.csv file + :param filenameVariants: path and name of the newVariants.csv file + :param filenameDistance: path and name of the levenshteinDistance.csv + :param filenameOccu: path and name of the occuResult.csv file + :param occuKeys: keys for the separated professions + :return: list with location information + """ + # a loop with one pass is necessary to be able to formulate a termination condition + for start in range(1): + # saving the name of the parallelization process + spawnPoolWorker = current_process().name + + # loading data of a GEDCOM file + data = loadGedcomFile(filename) + + # status information + print(spawnPoolWorker, "Status: The analysis of the occupational data for file", filename, "begins.") + + # list of all occupations in one source + allOccupationsInSource = [] + + # iteration of each line in the GEDCOM file + for counter, i in enumerate(data): + # continue if OCCU tag is present + if i[2:6] == "OCCU": + occupation = i[7:] + # some files have the anomaly that the OCCU tag is empty, but the profession information is in the PLAC tag below it + # if this is the case, the information of the next line should be used + if occupation == "": + occupation = data[counter + 1][7:] + allOccupationsInSource.append(occupation) + + # function must be executed iteratively, because otherwise it is called via parallelization + occuList = [] + for i in allOccupationsInSource: + occuList.append(createOccuList(i, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu, + filenameVariants, filenameDistance, filenameOccu, occuKeys, filename)[ + 0]) # "[0]" at the end is necessary because the function 
def createOccuList(phrase, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu,
                   filenameVariants, filenameDistance, filenameOccu, occuKeys, filename):
    """
    This function analyses one occupation statement and appends the results to the output files.
    :param phrase: original occupation statement to be analysed (string)
    :param existingVariantsKldB: data on the already classified occupation information
    :param fieldnamesVariants: column headings of the newVariants.csv file
    :param fieldnamesDistance: column headings of the levenshteinDistance.csv
    :param fieldnamesOccu: column headings of the occuResult.csv file
    :param filenameVariants: path and name of the newVariants.csv file
    :param filenameDistance: path and name of the levenshteinDistance.csv
    :param filenameOccu: path and name of the occuResult.csv file
    :param occuKeys: keys for the separated professions
    :param filename: designation of the file the statement comes from (string)
    :return: list with one dictionary: the result of "occucleaner" extended by "source" and "number"
    """
    # loading data of new variants
    # this is necessary every time, because an identical job title can occur in one of the parallel processes
    newVariants = loadData(filenameVariants, "\t", "latin1")

    # variant cleanup
    # the former check against a list of already processed designations was removed:
    # that list was created empty directly before the check, so the branch could never be reached
    resultOccucleaner = occucleaner(phrase, existingVariantsKldB)
    # completing the file name and setting the occurrence to 1
    resultOccucleaner.update({"source": filename})
    resultOccucleaner.update({"number": 1})
    occuList = [resultOccucleaner]

    # if the selection was made on the basis of the Levenshtein distance, this information should be saved
    levenList = []  # content for new rows in newVariants.csv
    levenList2 = []  # content for new rows in levenshteinDistance.csv

    # check all possible separated professions
    for key in occuKeys:
        occupation = resultOccucleaner[key]
        # if entry for the key is not filled in, then skip it
        if occupation == "":
            continue
        if occupation["selection info"] == "found after levenshtein":
            levenList.append({
                "variant": occupation["occupation"],
                "lemma": existingVariantsKldB[occupation["row of best fit lemma"]]["Variante"],
                "OhdAB_01": occupation["KldB 2010"]
            })
        elif occupation["selection info"] == "found after levenshtein NV":
            # NOTE(review): reads the column "Variante" from the rows of newVariants.csv - confirm that
            # this column exists in that file
            levenList.append({
                "variant": occupation["occupation"],
                "lemma": newVariants[occupation["row of best fit lemma"]]["Variante"],
                "OhdAB_01": occupation["KldB 2010"]
            })
        if occupation["similarity analysis"] != "":  # for levenshteinDistance.csv
            levenList2.append(occupation["similarity analysis"])

    # blocked printing of new lines in the files
    # writing stays best effort: a failure is reported but does not stop the processing
    # only "Exception" is caught so that KeyboardInterrupt and SystemExit are no longer swallowed
    try:
        appendFile(filenameOccu, occuList, fieldnamesOccu)
        appendFile(filenameVariants, levenList, fieldnamesVariants)
        appendFile(filenameDistance, levenList2, fieldnamesDistance)
    except Exception as error:
        print(
            "Error: Blocked printing of the lines failed. Manual deletion of the entries of the last file appropriate.")
        print("Error details:", repr(error))
    return (occuList)
todo + "MatchSequence", # ??? todo + "filter_$" # ??? todo + ] + # loading data from existing file + # if no file exists, a new one is created + existingVariantsKldB = createFile(filename, fieldnames, ";", "latin1") + + # status message on the number of existing variants + print("Status:", len(existingVariantsKldB), "classified variants already exist.") + + # if halving of variants is to be done for testing purposes, set halving to "yes" + halving = "no" + # deletion of every second already classified occupation information + if halving == "yes": + remainingVariantsKldB = [] + for zahl, i in enumerate(existingVariantsKldB): + if zahl % 2 == 0: + remainingVariantsKldB.append(i) + print("Status: There has been a halving of the variants for testing purposes.", len(remainingVariantsKldB), + "variants remain.") + # overwrite the variable of all variants + existingVariantsKldB = remainingVariantsKldB + + # create file for saving the newly classified files + filenameVariants = os.path.join("data", "newVariants.csv") + fieldnamesVariants = ["variant", # designation of the new variant of an occupation + "lemma", # existing designation of an occupation to which the new variant is assigned + "OhdAB_01" # code according to KldB + ] + createFile(filenameVariants, fieldnamesVariants, "\t", "latin1") + + # list about the best hits for each checked job title + filenameDistance = "levenshteinDistance.csv" + fieldnamesDistance = ["relative distance", # absolute Levenshtein distance divided by the length of the variant + "absolute distance", # absolute Levenshtein distance + "variant", # designation of the new variant of an occupation + "best fit lemma", # designation of the best fitting existing variant + "selection", # binary information whether the lemma was selected (1 means yes, 0 means no) + "lemma row" # number of the line in the existing variants + ] + createFile(filenameDistance, fieldnamesDistance, "\t", "latin1") + + # list for dividing the different components of a job 
specification + filenameOccu = "occuResult.csv" + fieldnamesOccu = ["variant", # designation of the new variant of an occupation + "source", # name of the file in which the variant occurs (source) + "number", # Number of occurrences of the variant in the source + "occupation 1", # information about the first occupation found + "occupation 2", # information about the second occupation found + "occupation 3", # information about the third occupation found + "occupation 4", # information about the fourth occupation found + "occupation 5" # information about the fifth occupation found + ] + createFile(filenameOccu, fieldnamesOccu, "\t", "latin1") + + # definition of the keys for the separated professions + occuKeys = ["occupation 1", "occupation 2", "occupation 3", "occupation 4", "occupation 5"] + + # initialization of a list in which the results of the upcoming parallelized process are stored + # this will process a list of occupation details in parallel + # the result is a list of dictionaries containing different information about the analysis (occuList) + occuList = [] + + # parallelization + pool = Pool(3) + occuList = pool.map(partial(preCreateOccuList, + existingVariantsKldB=existingVariantsKldB, + fieldnamesVariants=fieldnamesVariants, + fieldnamesDistance=fieldnamesDistance, + fieldnamesOccu=fieldnamesOccu, + filenameVariants=filenameVariants, + filenameDistance=filenameDistance, + filenameOccu=filenameOccu, + occuKeys=occuKeys), gedcomNamesListClear) + pool.close() + pool.join() + + # second processing loop for the designations that are not found but have components + # Example: "farmer and craftsman" is not found, but "farmer" and "craftsman" are found individually + + # second processing + gedcomNamesListClear2 = [] + # iterate all original occupation information + for i in occuList: + # iterate all the occupations sparred in it + for j in i: + # iterate the five possible keys ("occupation 1", ...) 
+ for key in occuKeys: + # if the entry for the key does not contain any content, skip it + if j[key] == "": + continue; + # only professions that are "not found" + if j[key]["selection info"] == "not found": + gedcomNamesListClear2.append(j["occupation 1"]["occupation"]) + + # parallelization + pool = Pool(3) + occuList2 = pool.map(partial(createOccuList, + existingVariantsKldB=existingVariantsKldB, + fieldnamesVariants=fieldnamesVariants, + fieldnamesDistance=fieldnamesDistance, + fieldnamesOccu=fieldnamesOccu, + filenameVariants=filenameVariants, + filenameDistance=filenameDistance, + filenameOccu=filenameOccu, + occuKeys=occuKeys, + filename=""), gedcomNamesListClear2) + pool.close() + pool.join() + + # the same again for a third iteration + + # third processing + gedcomNamesListClear3 = [] + # iterate all original occupation information + for i in occuList2: + # iterate all the occupations sparred in it + for j in i: + # iterate the five possible keys ("occupation 1", ...) + for key in occuKeys: + # if the entry for the key does not contain any content, skip it + if j[key] == "": + continue; + # only professions that are "not found" + if j[key]["selection info"] == "not found": + gedcomNamesListClear3.append(j[key]["occupation"]) + + # parallelization + pool = Pool(3) + occuList3 = pool.map(partial(createOccuList, + existingVariantsKldB=existingVariantsKldB, + fieldnamesVariants=fieldnamesVariants, + fieldnamesDistance=fieldnamesDistance, + fieldnamesOccu=fieldnamesOccu, + filenameVariants=filenameVariants, + filenameDistance=filenameDistance, + filenameOccu=filenameOccu, + occuKeys=occuKeys, + filename=""), gedcomNamesListClear3) + pool.close() + pool.join() + + # creation of statistics for the three iterations + statistics(occuList, occuKeys) + statistics(occuList2, occuKeys) + statistics(occuList3, occuKeys) + + # storage of the time at the ending of the program run + finishtime = time.perf_counter() + + # status info + print("Status: Program finished in", 
round(finishtime - starttime, 2), "seconds(s)") diff --git a/2022_005_goldberg/Skripte/readme.txt b/2022_005_goldberg/Skripte/readme.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce2cf0970c947c9fae29dd0a330a83fff367d9d8 --- /dev/null +++ b/2022_005_goldberg/Skripte/readme.txt @@ -0,0 +1,45 @@ +readme + +Die folgende Anleitung soll eine Benutzung des Python-Skripts und eine Interpretation der Ergebnisse ermöglichen. + +Bibliotheken: +Damit das Programm ausgeführt werden kann sind ggf. noch weitere Bibliotheken lokal zu installieren. In den ersten Zeilen des Skripts sind die benutzten Bibliotheken angegeben. +Eingangsdateien mit Berufsangaben: +Das Programm ist darauf ausgelegt, zwei verschiedene Arten von Eingangsdateien zu bearbeiten: (1.) CSV-Dateien und (2.) GEDCOM-Dateien. Je nachdem welche Art vorliegt ist im Programm der Parameter „typeOfData“ im Programmcode auf „csv“ oder „ged“ zu setzen. +Liegen die Berufsangaben in einer CSV-Datei vor, so ist diese so zu strukturieren, dass sie eine Spalte enthält, in dessen erster Zeile die Überschrift „occupation“ steht. In den folgenden Zeilen folgen jeweils die zu lemmatisierten Berufsangaben. Am Ablageort des Skripts muss auch Ordner „data“ existieren, in dem die Datei ablegt ist. Sie trägt die Bezeichnung „occupations.csv“. +Falls die Berufsangaben in GEDCOM-Dateien vorliegen, so sind die GEDCOM-Dateien mit fortlaufenden Ziffern zu benennen („1.ged“, „2.ged“ etc.). Ziffern dürfen nicht doppelt genutzt werden. Auch diese Dateien werden im Unterordner „data“ platziert. + +Variantenliste: +Wie die Eingangsdatei mit den neuen Berufsangaben wird auch die CSV-Datei mit den bestehenden Varianten dem Unterordner „data“ hinzugefügt. +Die Bezeichnung der Datei muss "variants.csv" sein. Sie enthält drei Spalten, die die Überschriften „variant“ und „OhdAB_01“ tragen. In der ersten Spalte steht die textuelle Bezeichnung und in der zweiten der zugeordnete OhdAb-Cod. 
Falls ein anderes Klassifizierungssystem angewendet wird, kann in der dritten Spalte auch jede beliebige Codierung genutzt werden – die Überschrift sollte dennoch nicht verändert werden. + +Parallelisierung: +Die Verarbeitung von GEDCOM-Dateien läuft parallel ab, um die Geschwindigkeit zu erhöhen. Hierzu kann festgelegt werden, wie viele Rechenkerne genutzt werden. Dazu ist der Parameter der Funktion „Pool()“ jeweils zu verändern. Bleibt er leer, so werden alle verfügbaren Rechenkerne genutzt. Im Skript ist die Anzahl der Kerne standardmäßig auf einen Kern festgelegt. + +Halbierung der Varianten: +Um die Halbierung der Varianten zu erreichen, ist die Variable halving auf "yes" zu setzen. Diese Möglichkeit dient vorwiegend zu Testzwecken. +Weitere Iterationen mit den neuen Varianten: +Unter Hinzuziehung der neu lemmatisierten Berufsvarianten ist es wahlweise möglich, weitere Berufsvarianten zu bearbeiten, zu denen es in der ursprünglichen Variantenliste keinen Treffer gibt. Im Standard sind drei Iterationen angelegt. Sollen diese nicht stattfinden, sind die Code-Bestandteile hinter dem Kommentar „second processing“ bzw. „third processing“ auszukommentieren. + +Ausgabedateien: +Als Ergebnis wird die Datei „occuResult.csv“ erzeugt und während des Programmdurchlaufs fortlaufend aufgebaut. Die jeweiligen Spalten sind mit Tabstopps voneinander separiert. Die Datei enthält die Bezeichnung der überprüften Variante („variant“), bei GEDCOM-Dateien den Namen der Datei, in der diese vorkam („source“), die Häufigkeit dieser Variante in dieser Quelle („number“). In den weiteren Spalten finden sich für die einzelnen Berufsangaben (occupation1-occupation5) in der Bezeichnung verschiedene Informationen. Es werden maximal fünf einzelne Berufe identifiziert (z. B. aus der Angabe „Häusler und Fleischer und Gastwirt und Richter und Schenker“). 
Die Informationen sind wie folgt gegliedert: + +Schema: Bezeichnung - Erläuterung +occupation - bereinigte Berufsbezeichnung +KldB 2010 - OhdAB-Code, falls eine Zuordnung möglich ist +best fit lemma - Bezeichnung der am besten passenden Variante +row of best fit lemma - Zeile der am besten passenden Variante in der Variantenliste +titel - aus der ursprünglichen Berufsangabe herausgefilterte Titularangabe +role - aus der ursprünglichen Berufsangabe herausgefilterte Rollenangabe +year - aus der ursprünglichen Berufsangabe herausgefilterte Jahresangabe +url - aus der ursprünglichen Berufsangabe herausgefilterte Angabe einer URL +location - aus der ursprünglichen Berufsangabe herausgefilterte Angabe eines Ortes +further info - aus der ursprünglichen Berufsangabe herausgefilterte sonstige Angaben (waren in Klammern vorhanden) +selection info - Information, ob die Berufsangabe einer bestehenden Variante zugeordnet werden konnte („found direct“, „not found“, „found after levenshtein“, „no occupational designation“) +absolute distance - absolute Levenshtein-Distanz zur am besten passenden Variante +relative distance - relative Levenshtein-Distanz zur am besten passenden Variante + +Zudem wird im Unterordner „data“ eine Datei „newVariants.csv“ erzeugt, die die Bezeichnung der neuen Variante („variant“), die Bezeichnung der bereits bestehenden Variante („lemma“) sowie den Code der OhdAB („OhdAB_01“) enthält. + + +Jan Michael Goldberg, 22. Februar 2022 \ No newline at end of file