Commit 3a7d71a4 authored by Martin Wiegand

Created folder for 2022_005_Goldberg

parent 068f94f4
import csv
import os.path
from multiprocessing import Pool, current_process
from functools import partial
import time
import json
import Levenshtein
import copy
def loadData(filename, delimiter, encoding):
"""
This function opens files in which data was temporarily stored by a previous run of the program.
:param filename: designation of the file (string)
:param delimiter: type of delimiter (string)
:param encoding: encoding of the file (string)
:return: list of dictionaries with information of the file to be loaded
"""
content = [] # list of dicts
try:
with open(filename, "r", encoding=encoding) as data: # , errors='ignore'
for i in csv.DictReader(data, delimiter=delimiter):
# the JSON round trip converts csv.DictReader rows (OrderedDicts) into plain dicts
i = json.loads(json.dumps(i))
content.append(i)
except FileNotFoundError:
print("Status: Inital pass for file", filename, "(no list created yet).")
return (content)
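# Illustrative usage sketch (added for clarity; the row contents shown are hypothetical):
# rows = loadData("data/variants.csv", ";", "latin1")
# # -> e.g. [{"idVariant": "1", "variant": "Schmied", "code": "..."}, ...]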
def appendFile(filename, data, fieldnames):
"""
This function appends rows to CSV files.
:param filename: designation of the file (string)
:param data: row or rows to be written (dictionary or list of dictionaries)
:param fieldnames: column names of the file to be written (list of strings)
"""
opener = open(filename, "a", newline='', encoding="utf-8")
writer = csv.DictWriter(opener, fieldnames=fieldnames, delimiter="\t")
# differentiation of cases where one or more lines are to be added
# for the file "qualityofgedcom.csv" only one line should be written at a time
# for all other files several lines should be written
if filename == "qualityofgedcom.csv":
writer.writerow(data)
else:
writer.writerows(data)
opener.close()
def createFile(filename, fieldnames, delimiter, encoding):
"""
This function creates a new file if no file already exists under this name.
The function is also used to load data when it is clear that the file already exists.
:param filename: designation of the file (string)
:param fieldnames: column names of the file to be written (list of strings)
:param delimiter: type of delimiter (string)
:param encoding: encoding of the file (string)
:return: list of dictionaries with information of the file to be loaded
"""
content = loadData(filename, delimiter, encoding)
# create a new file if it is not there
if len(content) == 0: # check if the variable does not contain any data
opener = open(filename, "w", newline='', encoding="utf-8-sig")
writer = csv.writer(opener, delimiter=delimiter)
writer.writerow(fieldnames)
opener.close()
return (content)
def loadGedcomFile(filename):
"""
This function loads the data of a GEDCOM file and writes them line by line into a list.
:param filename: name of the file
:return: in case of error "NONE", otherwise a list with the information of the GEDCOM file
"""
# define file path
filepath = os.path.join("data", filename)
preparedData = []
try:
gedcom = open(filepath, "r", encoding="utf-8")
data = gedcom.readline()
# initial transfer of the headline
data = data[:-1] # delete the unimportant last character of each line
while data != "": # last line is empty
data = str(gedcom.readline())
data = data[:-1] # delete the unimportant last character of each line
preparedData.append(data)
gedcom.close()
return (preparedData)
except FileNotFoundError:
print("Error: There is a problem with access to the file", filename, ".")
return ("NONE")
def separator(occu, replaced, replacer):
"""
This function is used to replace separation operators.
:param occu: string that is processed
:param replaced: content to be replaced (string)
:param replacer: replacement for the content to be replaced (string)
:return: new string with changed content
"""
if replaced in occu:
occu = occu.replace(replaced, replacer)
return (occu)
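# Illustrative usage sketch (added for clarity; example values are hypothetical):
# separator("Bauer u. Schmied", " u.", " und") # -> "Bauer und Schmied"
# separator("Bauer/Schmied", "/", " und ") # -> "Bauer und Schmied"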
def endOfString(phrase, signalWord):
"""
This function is used to detect the position of an element of a string.
The respective end position of a part is determined, if it exists.
Everything before this position is removed.
:param phrase: string to be searched (string)
:param signalWord: displays a place name (string)
:return: text after the end position of the signal word in the phrase
"""
# if phrase contains the signal word, find the end position of the signal word and return everything after it
if signalWord in phrase:
endOfString = phrase[(phrase.find(signalWord) + len(signalWord)):]
return (endOfString)
return ("")
def replaceLoc(signalWord, phrase, loc):
"""
This function is used to remove location names from the occupation phrase.
:param signalWord: displays a place name (string)
:param phrase: string to be searched (string)
:param loc: designation of a place (string)
:return: adjusted occupation phrase
"""
if signalWord in phrase:
phrase = phrase.replace(signalWord, "") # remove "signalWord"
phrase = phrase.replace(loc, "") # remove location
return (phrase)
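# Illustrative usage sketch (added for clarity; example values are hypothetical):
# endOfString and replaceLoc work together to split off a place name:
# loc = endOfString("Schmied in Halle", " in ") # -> "Halle"
# replaceLoc(" in ", "Schmied in Halle", loc) # -> "Schmied"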
def dictSearch(relevantDict, key, relevantObject):
"""
This function searches a given list of dictionaries for a value under a specified key.
:param relevantDict: list of dictionaries that will be searched
:param key: key of the dictionary to be studied
:param relevantObject: name of the value to be searched for under the key in the Dictionary
:return: number of the searched dictionary in the list (if none is found "-1")
"""
# search per list comprehension
# note: the comparison is case-sensitive
occuIndex = next((index for (index, d) in enumerate(relevantDict) if d[key] == relevantObject), None)
if occuIndex is None:
return (-1) # if it could not be found
return (occuIndex)
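# Illustrative usage sketch (added for clarity; list contents and codes are hypothetical):
# variants = [{"variant": "Bauer", "code": "111"}, {"variant": "Schmied", "code": "242"}]
# dictSearch(variants, "variant", "Schmied") # -> 1
# dictSearch(variants, "variant", "schmied") # -> -1 (case-sensitive)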
def partCorrector(phrase, existingVariantsKldB):
"""
This function cleans up an occupation specification.
Information that is not related to the occupation itself will be filtered out.
In addition, an attempt is made to find a lemma for this occupation.
:param phrase: occupation (string)
:param existingVariantsKldB: already classified occupation KldB (list of dictionaries)
:return: information about the occupation (dictionary)
"""
# initialization of variables, so that they exist in any case
titel = ""
role = ""
year = ""
url = ""
brackets = ""
# step 5: geographic prepositions ("loc" stands for location)
# find and save place names
# place name is currently overwritten if several of them occur
# there are signal words that indicate a subsequent location (e.g. "in", "im")
loc = endOfString(phrase, " in ") # "loc" is needed for the upcoming function
phrase = replaceLoc(" in ", phrase, loc)
loc = endOfString(phrase, " im ")
phrase = replaceLoc(" im ", phrase, loc)
loc = endOfString(phrase, " In ")
phrase = replaceLoc(" In ", phrase, loc)
loc = endOfString(phrase, " i. ")
phrase = replaceLoc(" i. ", phrase, loc)
loc = endOfString(phrase, " von ")
phrase = replaceLoc(" von ", phrase, loc)
loc = endOfString(phrase, " v. ")
phrase = replaceLoc(" v. ", phrase, loc)
loc = endOfString(phrase, " zu ")
phrase = replaceLoc(" zu ", phrase, loc)
loc = endOfString(phrase, " auf ")
phrase = replaceLoc(" auf ", phrase, loc)
loc = endOfString(phrase, " aus ")
phrase = replaceLoc(" aus ", phrase, loc)
loc = endOfString(phrase, " Aus ")
phrase = replaceLoc(" Aus ", phrase, loc)
loc = endOfString(phrase, " an ")
phrase = replaceLoc(" an ", phrase, loc)
loc = endOfString(phrase, " der ")
phrase = replaceLoc(" der ", phrase, loc)
loc = endOfString(phrase, " des ")
phrase = replaceLoc(" des ", phrase, loc)
loc = endOfString(phrase, " van ")
phrase = replaceLoc(" van ", phrase, loc)
# besides location information there are signal words for employers
# "loc" continues to be used here, even though the literal sense no longer fits here
loc = endOfString(phrase, " bei ", )
phrase = replaceLoc(" bei ", phrase, loc)
loc = endOfString(phrase, " bei dem ")
phrase = replaceLoc(" bei dem ", phrase, loc)
loc = endOfString(phrase, " beim ")
phrase = replaceLoc(" beim ", phrase, loc)
loc = endOfString(phrase, " bei der ")
phrase = replaceLoc(" bei der ", phrase, loc)
# then there are signal words in front of an occupation that make clear the affiliation to a dominion
affiliation = ["herrschaftlich", "herrschaftliche", "herrschaftlicher", "königlich", "königliche", "königlicher",
"fürstlich", "fürstliche", "fürstlicher"]
for i in affiliation:
if i in phrase:
# this information should not be deleted from the occupation statement
# it should only be stored in "loc" to be output separately afterwards
# if "loc" is empty, then no comma should precede it
if loc != "":
loc = loc + ", " + i
else:
loc = i
# find and save years
# more detailed dates are reduced to the year
# assumption: years always have four digits and stand at the beginning
# check if the first character is a number
if phrase[:1].isdigit() is True:
# check if the first four characters are a number
if phrase[:4].isdigit() is True:
# separate year and part behind
year = phrase[:4]
phrase = phrase[4:]
# brackets content
if "(" in phrase and ")" in phrase:
brackets = phrase[phrase.find("("):phrase.find(")")]
phrase = phrase[:phrase.find("(")] + phrase[phrase.find(")") + 2:] # +2 because of parenthesis and space
if "[" in phrase and "]" in phrase:
brackets = phrase[phrase.find("["):phrase.find("]")]
phrase = phrase[:phrase.find("[")] + phrase[phrase.find("]") + 2:] # +2 because of parenthesis and space
# find and save URLs
# example: <a href="https://de.wikipedia.org/wiki/Geschichte_des_Kantons_Thurgau#Grafen_im_Thurgau">Graf im Thurgau</a>
if "<a" in phrase and "</a>" in phrase:
url = phrase[phrase.find("<a"):phrase.find("</a>")]
phrase = phrase[:phrase.find("<a")] + phrase[phrase.find("</a>"):]
# find and save role
# wife
if "F. d." in phrase:
role = "Frau"
phrase = endOfString(phrase, "F. d.")
if "Ehefrau des" in phrase:
role = "Frau"
phrase = endOfString(phrase, "Ehefrau des")
if "Ehefrau d." in phrase:
role = "Frau"
phrase = endOfString(phrase, "Ehefrau d.")
if "Ehefrau" in phrase:
role = "Frau"
phrase = endOfString(phrase, "Ehefrau")
if "frau" in phrase and "Haus" != phrase[:4] and "Acker" != phrase[:5]:
role = "Frau"
phrase = phrase.replace("sfrau", "")
phrase = phrase.replace("frau", "")
# daughter
if "T. d." in phrase:
role = "Tochter"
phrase = endOfString(phrase, "T. d.")
if "tochter" in phrase:
role = "Tochter"
phrase = phrase.replace("stochter", "")
phrase = phrase.replace("tochter", "")
# son
if "S. d." in phrase:
role = "Sohn"
phrase = endOfString(phrase, "S. d.")
if "sohn" in phrase:
role = "Sohn"
phrase = phrase.replace("ssohn", "")
phrase = phrase.replace("sohn", "")
# find and save titles
if "Prof." in phrase:
titel = "Professor"
phrase = endOfString(phrase, "Prof.")
if "Professor" in phrase:
titel = "Professor"
phrase = endOfString(phrase, "Professor")
# step 9: temporal prepositions and numerals
if " am " in phrase:
year = endOfString(phrase, " am ")
phrase = phrase.replace(" am ", "")
phrase = phrase.replace(year, "")
if " bis " in phrase:
year = endOfString(phrase, " bis ")
phrase = phrase.replace(" bis ", "")
phrase = phrase.replace(year, "")
# delete numbers, unless they end with a dot; 4 consecutive digits are taken as a year
numbers = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
numberLength = 0
prePart = phrase
for i in range(len(phrase)):
if prePart[i:i + 1] in numbers:
numberLength = numberLength + 1
if prePart[i + 1:i + 2] != "." and prePart[i + 1:i + 2] not in numbers:
if numberLength == 4:
year = prePart[i - 3:i + 1]
phrase = phrase.replace(year, "")
numberLength = 0
else:
phrase = phrase.replace(phrase[i - numberLength + 1:i + 1], "")
numberLength = 0
elif phrase[i + 1:i + 2] == ".":
numberLength = 0
# remove remaining special characters
phrase = phrase.replace(":", "")
# remove blanks here again
# "cleanedOccupation" is what remains of the occupation specification
cleanedOccupation = phrase.strip()
# search if there is a corresponding counterpart in the already classified occupational data
occuIndex = dictSearch(existingVariantsKldB, "variant", cleanedOccupation)
# if occuIndex is not "-1", then a counterpart was found
if occuIndex != -1:
# KldB identifier
kldb = existingVariantsKldB[occuIndex]["code"]
# way of selection of a counterpart
info = "found direct"
# "levDict" stands for Levenshtein dictionary
# the name is not quite appropriate here, because no Levenshtein distance is used
# for uniformity of the variable it is used anyway
levDict = {"lemma row": occuIndex, # line of the matching dictionary
"variant": "",
"best fit lemma": existingVariantsKldB[occuIndex]["variant"],
# designation of the appropriate occupation
"absolute distance": "",
"relative distance": "",
"selection": ""
}
# if occuIndex is "-1", no counterpart was found and a similarity analysis starts
elif occuIndex == -1 and cleanedOccupation != "": # cleanedOccupation must not be empty
# similarity analysis
levDict = levenshteinDist(existingVariantsKldB, "variant", cleanedOccupation, "code")
# setting the relative Levenshtein distance of 0.25 as the essential threshold for selection
if levDict["relative distance"] < 0.25:
levDict.update({"selection": 1})
kldb = existingVariantsKldB[levDict["lemma row"]]["code"] # take the line here from the levDict
# way of selection of a counterpart
info = "found after levenshtein"
else:
# no counterpart found
levDict.update({"selection": 0})
kldb = ""
info = "not found"
# no occupation remains
else:
kldb = ""
info = "no occupational designation"
levDict = {"lemma row": "", "variant": "", "best fit lemma": "", "absolute distance": "",
"relative distance": "", "selection": ""}
# store the information sorted for each phrase (occupation)
occupationResult = {
"occupation": cleanedOccupation,
"best fit lemma": levDict["best fit lemma"],
"row of best fit lemma": levDict["lemma row"],
"KldB 2010": kldb,
"titel": titel,
"role": role,
"location": loc,
"year": year,
"url": url,
"further info": brackets,
"selection info": info,
"similarity analysis": levDict,
"lemma row": levDict["lemma row"],
"absolute distance": levDict["absolute distance"],
"relative distance": levDict["relative distance"]
}
return (occupationResult)
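# Illustrative usage sketch (added for clarity; the phrase and the result values
# are hypothetical and depend on the contents of existingVariantsKldB):
# partCorrector("1845 Schmied in Halle", existingVariantsKldB)
# # -> {"occupation": "Schmied", "year": "1845", "location": "Halle",
# #     "KldB 2010": "...", "selection info": "found direct", ...}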
def abbreviationsCorrector(firstString, secondString):
"""
This function compares two phrases and checks if one of them could be an abbreviation of the other.
If "s"econdString" is an abbreviation of "firstString", "firstString" will be returned truncated.
:param firstString: first phrase without abbreviation (string)
:param secondString: second phrase with abbreviation (string)
:return: resolved abbreviation of "firstString" (string)
"""
# continue only if there is a dot in "secondString"
# first letters must be equal (runtime improvement)
if "." in secondString and secondString[:1] == firstString[:1]:
positionDot = secondString.find(".")
# find the abbreviated part in the other string and delete it in the original name
# count backwards to find blanks
for position in range(positionDot, 0, -1):
if secondString[position:position + 1] == " ":
beforeDot = secondString[position:positionDot]
break
elif position == 1:
beforeDot = secondString[:positionDot]
# testing minimum length
try:
# the part before the dot must be more than three letters long
if positionDot - position < 4:
# if it is shorter, return the original value
return (firstString)
except UnboundLocalError:
position = 0
beforeDot = secondString[position:positionDot]
# the part before the dot must be more than three letters long
if positionDot - position < 4:
# if it is shorter, return the original value
return (firstString)
if beforeDot in firstString:
positionPart = firstString.find(beforeDot) + len(beforeDot)
for position in range(positionPart, len(firstString) + 1):
# blank, hyphen or general end; +1 is allowed here, is then simply empty
if firstString[position:position + 1] == " " or firstString[
position:position + 1] == "-" or position == len(
firstString):
positionEnd = position
break;
# abbreviation found, abbreviate original name
firstString = firstString[:positionPart] + ". " + firstString[positionEnd:]
return (firstString)
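# Illustrative usage sketch (added for clarity; example values are hypothetical):
# abbreviationsCorrector("Handelsmann", "Handelsm.") # -> "Handelsm. "
# the unabbreviated first string is truncated so that both strings become directly comparable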
def levenshteinDist(existingVariantsKldB, key, relevantObject, keyRelevantDict):
"""
This function determines, via the Levenshtein distance, the most similar already classified variant for a given occupation.
:param existingVariantsKldB: already classified occupation KldB (list of dictionaries)
:param key: designation of the key for the "relevantDict" (string)
:param relevantObject: occupation for which a similar, already classified value is to be found (string)
:param keyRelevantDict: name of the column that contains the identifier (string)
:return: information on similarity analysis (dictionary)
"""
# the best fitting value is to be found
# initial high values for a Levenshtein distance, which are undercut in any case
minimalDistAbs = 99999 # absolute
minimalDistRel = 99999 # relative
# binary variable, 0 if no hit was found, 1 if at least one hit was found
minOneFound = 0
# check against each existing entry
for counter, i in enumerate(existingVariantsKldB):
# Lower case for better comparability
relevantObjectLowerCase = relevantObject.lower()
existingVariantLowerCase = copy.copy(
i[key]).lower() # copy important because it is changed afterwards
# compare only if first letters are the same (serves to improve runtime)
if existingVariantLowerCase[:1] == relevantObjectLowerCase[:1]:
# calculate Levenshtein distance
levDistAbs = Levenshtein.distance(existingVariantLowerCase, relevantObjectLowerCase)
# multiply levDistAbs by the number of blanks (+1) and divide by the length, to avoid e.g. "gewesener königlicher Richter"/"gewesener königlicher Koch"
levDistRel = levDistAbs * (relevantObject.count(" ") + 1) / len(relevantObject)
# when the next one fits better
if levDistRel < minimalDistRel:
minimalDistAbs = levDistAbs
minimalDistRel = levDistRel
bestFitLemma = i[key]
cacheCounter = counter
# is overwritten until an equal one comes along
hitlist = [[i[key], cacheCounter]]
# if the next one fits equally well
if levDistRel == minimalDistRel:
hitlist.append([i[key], counter])
# at least one hit
minOneFound = 1
# no similarity
else:
continue
# select one in case of multiple hits
# selection is made by greatest match from the front (matching letters)
try:
# if there were several hits of the same quality
# anything above 0.25 is assumed to be unrealistic here, serves to improve runtime
if len(hitlist) > 1 and minimalDistRel < 0.25:
# initialization of counters
numberMatchingChars = 0
maxNumberMatchingChars = 0
numberMatchingCharsList = []
for hitIndex, j in enumerate(hitlist):
# count the number of characters matching from the front for each hit
numberMatchingChars = 0
maxNumberMatchingChars = 0
for charPosition in range(min(len(j[0]), len(relevantObject))):
# if the respective letters of the strings to be compared are the same
if j[0][charPosition] == relevantObject[charPosition]:
# count up
numberMatchingChars = numberMatchingChars + 1
# note the maximum number of matching characters
maxNumberMatchingChars = numberMatchingChars
# stop at the first deviating character
else:
break
numberMatchingCharsList.append([hitIndex, maxNumberMatchingChars])
# Selection of the result with the closest match (no longer has anything to do with Levenshtein distance)
longestMatch = 0
# iterate all results of the numberMatchingCharsList
for j in numberMatchingCharsList:
# select so most suitable
if j[1] > longestMatch: # [1] is maxNumberMatchingChars
longestMatch = j[1]
charPosition = j[0] # [0] is the index of the hit in the hitlist
# there can be best results for the same time
# that is ignored at this point
# only one status message is issued
# the second, equally matching value, is not selected
# if j[1] == longestMatch:
# this may be due to the fact that equal values are compared
# duplicates exist in the list of already classified occupational data
# therefore values to be compared can be the same
# if hitlist[j[0]][0] == hitlist[charPosition][0]:
# print("Status: A dublette exists in the list of possible hits(" + hitlist[j[0]][0] + ", " + hitlist[charPosition][0] + ")")
# continue
# but the values do not always have to be the same, they can also just have the same beginning
# print("Status: Two very similar values exist in the list of possible hits(" + hitlist[j[0]][0] + ", " + relevantObject + ")")
# overwrite the relevant variables
bestFitLemma = hitlist[charPosition][0]
cacheCounter = hitlist[charPosition][1]
except UnboundLocalError:
pass
# alternative, if the possibility above did not lead to success
# this may be due to the fact that abbreviations are included
if minimalDistRel >= 0.25:
# search for abbreviations marked with a dot
for counter, i in enumerate(existingVariantsKldB):
designationCopy = relevantObject.lower()
originalDesignation = copy.copy(i[key]).lower() # copy important because it is changed afterwards
# only if first letters are equal (runtime improvement)
if originalDesignation[:1] == designationCopy[:1]:
# abbreviation handling
preDesignationCopy = designationCopy # save previous value
designationCopy = abbreviationsCorrector(designationCopy, originalDesignation)
if designationCopy == preDesignationCopy:
# the same again the other way around
originalDesignation = abbreviationsCorrector(originalDesignation, designationCopy)
levDist = Levenshtein.distance(originalDesignation, designationCopy)
if levDist < minimalDistAbs: # minimalDistRel
minimalDistAbs = levDist # minimalDistRel
# if the new value is smaller, then overwrite relevant variables
bestFitLemma = i[key]
cacheCounter = counter
# at least one hit
minOneFound = 1
if minOneFound == 0:
bestFitLemma = "nothing" # occurs, if e.g. the first letter is a colon; there is no variant to
cacheCounter = -1
# merge information
levenDict = {
"lemma row": cacheCounter,
"variant": relevantObject,
"best fit lemma": bestFitLemma,
"absolute distance": minimalDistAbs,
"relative distance": minimalDistRel
}
return (levenDict)
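# Illustrative sketch of the distance measure (added for clarity; values are hypothetical):
# for the new designation "Schmiedt" and an existing variant "Schmied":
# absolute distance = 1 (one deleted letter)
# relative distance = 1 * (0 blanks + 1) / len("Schmiedt") = 0.125
# 0.125 < 0.25, so the caller would accept "Schmied" as the best fit lemma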
def occuCleaner(occu, existingVariantsKldB):
"""
This function cleans up individual occupation information.
In doing so, various pieces of information are separated from the original job title.
These can be several individual job titles, but also non-occupational information.
:param occu: occupational title
:param existingVariantsKldB: already classified occupation KldB (list of dictionaries)
:return: information about the different occupational indications in the original indication (dictionary)
"""
# storage of the original occupational title
originalOccu = occu
# print(occu)
# initialization
# "occu1" does not need to be initialized because there is at least one occupation specification
occu2 = {} # ""
occu3 = {} # ""
occu4 = {} # ""
occu5 = {} # ""
# initialization
part1 = ""
part2 = ""
part3 = ""
part4 = ""
part5 = ""
# general preprocessing
# step 1: Remove spaces at the beginning and end
occu = occu.strip()
# step 2: Write out abbreviations
if "mstr." in occu:
occu = occu.replace("mstr.", "meister")
if "Ing." in occu:
occu = occu.replace("Ing.", "Ingenieur")
# step 3: Normalize separation operators
occu = separator(occu, " u.", " und")
occu = separator(occu, "+", " und ") # there are also "und" (and) without spaces
occu = separator(occu, ", ", " und ")
occu = separator(occu, ",", " und ")
occu = separator(occu, "; ", " und ")
occu = separator(occu, " & ", " und ")
occu = separator(occu, " / ", " und ")
occu = separator(occu, "/", " und ")
# detail processing
# separate multiple occupations
partList = [part1, part2, part3, part4, part5] # parts are still all empty here
partCounter = 0
separation = " und "
partList[0] = occu # is needed for initialization because the while loop accesses the next one
# < 4, because the number of separated parts is limited to five
while separation in partList[partCounter] and partCounter < 4:
st = partList[partCounter]
# exception: do not separate when "-" comes before "und", e.g. "Kauf- und Handelsmann", or in "k. u. k."
if "- und " not in st and "k. und k." not in st:
partList[partCounter] = st[:st.find(" und ")] # first part
partList[partCounter + 1] = st[(st.find(" und ") + len(" und ")):] # second part
partCounter = partCounter + 1
# write back values from the partList
part1 = partList[0]
part2 = partList[1]
part3 = partList[2]
part4 = partList[3]
part5 = partList[4]
if partCounter == 0: # if there is only one part
part1 = occu
# the content of the individual professional data is added to the dictionary afterwards
# only fill in if there is really content there
occu1 = partCorrector(part1, existingVariantsKldB)
if part2 != "": # if there is no part2, then just keep going
occu2 = partCorrector(part2, existingVariantsKldB)
if part3 != "": # can only be if there was a part2 beforehand
occu3 = partCorrector(part3, existingVariantsKldB)
if part4 != "":
occu4 = partCorrector(part4, existingVariantsKldB)
if part5 != "":
occu5 = partCorrector(part5, existingVariantsKldB)
# information about the different occupational indications in the original indication
occuDictOfDicts = {
"variant": originalOccu,
"occupation 1": occu1, # occu1 is a dictionary with occupation information
"occupation 2": occu2,
"occupation 3": occu3,
"occupation 4": occu4,
"occupation 5": occu5
}
return (occuDictOfDicts)
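# Illustrative usage sketch (added for clarity; example values are hypothetical):
# occuCleaner("Bauer, Schmied", existingVariantsKldB) first normalizes "," to
# " und " and then separates the two occupations "Bauer" and "Schmied";
# "Kauf- und Handelsmann", by contrast, is deliberately kept together.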
def statistics(occuList, occuKeys):
"""
This function counts the number of lemmatizations over the different process branches.
:param occuList: list of dictionaries with information to analysed occupational information
:param occuKeys: column headings for the analysis of separated occupations
"""
# initialization of counters
counter = 0 # found directly in existing variants
counter0 = 0 # empty occupational designations (only came about as a result of cleanup, e.g. because only location information was given)
counter2 = 0 # found by Levenshtein distance
counter3 = 0 # could not be found
counter4 = 0 # found by Levenshtein distance NV
counter5 = 0 # found directly in existing variants NV
for i in occuList:
try:
# iterate the five possible keys ("occupation 1", ...)
for key in occuKeys:
# if the entry for the key does not contain any content, skip it
if i == [] or i[key] == {}:
continue
elif i[key]["selection info"] == "found direct":
counter = counter + i["number"]
elif i[key]["selection info"] == "found after levenshtein":
counter2 = counter2 + i["number"]
elif i[key]["selection info"] == "not found":
counter3 = counter3 + i["number"]
elif i[key]["selection info"] == "no occupational designation":
counter0 = counter0 + i["number"]
elif i[key]["selection info"] == "found after levenshtein NV":
counter4 = counter4 + i["number"]
elif i[key]["selection info"] == "found direct NV":
counter5 = counter5 + i["number"]
else:
print("Error: Selection information is missing.")
except:
# iterate the five possible keys ("occupation 1", ...)
for key in occuKeys:
# if the entry for the key does not contain any content, skip it
if i == [] or i[0][key] == {}:
continue
elif i[0][key]["selection info"] == "found direct":
counter = counter + i[0]["number"]
elif i[0][key]["selection info"] == "found after levenshtein":
counter2 = counter2 + i[0]["number"]
elif i[0][key]["selection info"] == "not found":
counter3 = counter3 + i[0]["number"]
elif i[0][key]["selection info"] == "no occupational designation":
counter0 = counter0 + i[0]["number"]
elif i[0][key]["selection info"] == "found after levenshtein NV":
counter4 = counter4 + i[0]["number"]
elif i[0][key]["selection info"] == "found direct NV":
counter5 = counter5 + i[0]["number"]
else:
print("Error: Selection information is missing.")
# output of statistical information
# sum of all counters; the "+ 1" in the denominators below avoids division by zero
counterSum = counter0 + counter + counter2 + counter3 + counter4 + counter5
print("Status: Proportion of adjusted occupations found directly in the variants:", counter / (counterSum + 1),
counter)
print("Status: proportion of adjusted occupations found directly in the variants NV:",
counter5 / (counterSum + 1),
counter5)
print("Status: Proportion of adjusted occupations found with Levensthein distance:", counter2 / (counterSum + 1),
counter2)
print("Status: Proportion of adjusted occupations found with Levensthein distance NV:", counter4 / (counterSum + 1),
counter4)
print("Status: Proportion of adjusted occupations not found", counter3 / (counterSum + 1), counter3)
print("Status: Proportion of empty job titles (through cleanup)", counter0 / (counterSum + 1), counter0)
def preCreateOccuList(filename, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu,
filenameVariants, filenameDistance, filenameOccu, occuKeys):
"""
This function creates a list of occupational information available in a GEDCOM file.
:param filename: designation of the file (string)
:param existingVariantsKldB: data on the already classified occupation information
:param fieldnamesVariants: column headings of the newVariants.csv file
:param fieldnamesDistance: column headings of the levenshteinDistance.csv
:param fieldnamesOccu: column headings of the occuResult.csv file
:param filenameVariants: path and name of the newVariants.csv file
:param filenameDistance: path and name of the levenshteinDistance.csv
:param filenameOccu: path and name of the occuResult.csv file
:param occuKeys: keys for the separated professions
:return: list with occupational information
"""
# a loop with one pass is necessary to be able to formulate a termination condition
for start in range(1):
# saving the name of the parallelization process
spawnPoolWorker = current_process().name
# loading data of a GEDCOM file
data = loadGedcomFile(filename)
# status information
print(spawnPoolWorker, "Status: The analysis of the occupational data for file", filename, "begins.")
# list of all occupations in one source
allOccupationsInSource = []
# iteration of each line in the GEDCOM file
for counter, i in enumerate(data):
# continue if OCCU tag is present
if i[2:6] == "OCCU":
occupation = i[7:]
# some files have the anomaly that the OCCU tag is empty, but the profession information is in the PLAC tag below it
# if this is the case, the information of the next line should be used
if occupation == "":
occupation = data[counter + 1][7:]
allOccupationsInSource.append(occupation)
# function must be executed iteratively, because otherwise it is called via parallelization
occuList = []
# avoid duplicates
dubletCounterDict = {}
avoidDublettesList = []
for i in allOccupationsInSource:
# if the variant has already been edited, it should not be edited again
# however, a counter should then be maintained, which documents the number of occurrences
if i in avoidDublettesList: # comparison with already processed variants
# count up
dubletCounterDict.update({i: dubletCounterDict[i] + 1})
# update of the occuList
# searching for the right row
occuListPosition = next((item for item in occuList if item["variant"] == i), None)
occuListPosition["number"] = dubletCounterDict[i]
# if the occupation information has not yet been processed, then this should be done as follows
else: # occupation statement for the first time in this source
dubletCounterDict.update({i: 1})
# extension of the list of processed designations
result = createOccuList(i, existingVariantsKldB, filename, dubletCounterDict)
occuList.append(result[0]) # "[0]" at the end is necessary because the function returns a list
avoidDublettesList.append(result[0]["variant"])
printOccuList(occuList, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu,
filenameVariants, filenameDistance, filenameOccu, occuKeys)
return (occuList)
def createOccuList(phrase, existingVariantsKldB, filename, dubletCounterDict):
"""
This function creates a list of occupational information available in a source.
:param phrase: occupational designation (string)
:param existingVariantsKldB: data on the already classified occupation information (list)
:param filename: designation of the file (string)
:param dubletCounterDict: number of identical occupational designations in a source (dictionary)
:return: list with occupational information
"""
# a loop with one pass is necessary to be able to formulate a termination condition
for start in range(1):
# create a list with information about the new variants
occuList = [] # list of unadjusted variants in the source (list entries are dictionaries with a lot of information)
designationList = [] # list of adjusted variants in the source
# if the variant has already been edited, it should not be edited again
# however, a counter should then be maintained, which documents the number of occurrences
if phrase in designationList: # comparison with already processed variants
# search for the entry in the occuList that matches the variant
for j in occuList:
if j["variant"] == phrase:
# count up number
j["number"] = j["number"] + 1
# skip processing
continue
# if the occupation information has not yet been processed, then this should be done as follows
else: # occupation statement for the first time in this source
# extension of the list of processed designations
designationList.append(phrase)
# variant cleanup
resultOccucleaner = occuCleaner(phrase, existingVariantsKldB)
# completing the file name and setting the occurrence to 1
resultOccucleaner.update({"source": filename})
try:
resultOccucleaner.update({"number": dubletCounterDict[phrase]})
except:
resultOccucleaner.update({"number": 1})
# adding to the occuList in a dictionary
occuList.append(resultOccucleaner)
return (occuList)
def printOccuList(occuList, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu,
filenameVariants, filenameDistance, filenameOccu, occuKeys):
"""
This function writes the results of the occupation analysis to the output files.
:param occuList: information to the occupational designations (list of dictionaries)
:param existingVariantsKldB: data on the already classified occupation information (list of dictionaries)
:param fieldnamesVariants: column headings of the newVariants.csv file (list of strings)
:param fieldnamesDistance: column headings of the levenshteinDistance.csv (list of strings)
:param fieldnamesOccu: column headings of the occuResult.csv file (list of strings)
:param filenameVariants: path and name of the newVariants.csv file (string)
:param filenameDistance: path and name of the levenshteinDistance.csv (string)
:param filenameOccu: path and name of the occuResult.csv file (string)
:param occuKeys: keys for the separated professions (list of strings)
:return: nothing (only executes write operations)
"""
# loading data of new variants
# this is necessary every time, because an identical job title can occur in one of the parallel processes
newVariants = loadData(filenameVariants, "\t", "latin1")
# if the selection was made on the basis of the Levenshtein distance, this information should be saved
# two lists are created for this purpose
levenList = [] # list is used to create the content for a new row in newVariants.csv
levenList2 = [] # list is used to create the content for a new line in "levenshteinDistance.csv"
# Iteration per occupation specification in the source
for i in occuList:
# check all five possible separated professions
for key in occuKeys:
# if entry for the key is not filled in, then skip it
if i[key] == {}:
continue
if i[key]["selection info"] == "found after levenshtein":
newDict = {
"variant": i[key]["occupation"],
"lemma": existingVariantsKldB[i[key]["row of best fit lemma"]]["variant"],
"code": i[key]["KldB 2010"]
}
levenList.append(newDict)
elif i[key]["selection info"] == "found after levenshtein NV":
newDict = {
"variant": i[key]["occupation"],
"lemma": newVariants[i[key]["row of best fit lemma"]]["variant"],
"code": i[key]["KldB 2010"]
}
levenList.append(newDict)
if i[key]["similarity analysis"] != "": # for levenshteinDistance.csv
levenList2.append(i[key]["similarity analysis"])
# blocked printing of new lines in the files
# all files should be at the same level
# so if an error occurs with a variable, all files are not written to
# try:
# unpack dictionary information
unpackInfoList = []
for j in occuList:
# iterate j (occupations)
unpackInfoDict = {}
for i in j:
# contents of the dictionary are unpacked and written into individual fields
if type(j[i]) == dict:
for dictKey in j[i]:
if dictKey == "absolute distance":
unpackInfoDict.update(
{str(i) + "-" + str(dictKey): j[i]["similarity analysis"]["absolute distance"]})
if dictKey == "relative distance":
unpackInfoDict.update(
{str(i) + "-" + str(dictKey): j[i]["similarity analysis"]["relative distance"]})
else:
unpackInfoDict.update({str(i) + "-" + str(dictKey): j[i][dictKey]})
# delete "similarity analysis" if it is there
try:
unpackInfoDict.pop(i + "-similarity analysis")
except:
pass
# if it is not a dictionary, then the content is taken over like this
else:
unpackInfoDict.update({i: j[i]})
unpackInfoList.append(unpackInfoDict)
appendFile(filenameOccu, unpackInfoList, fieldnamesOccu)
appendFile(filenameVariants, levenList, fieldnamesVariants)
# appendFile(filenameDistance, levenList2, fieldnamesDistance)
# except:
# print(
# "Error: Blocked printing of the lines failed. Manual deletion of the entries of the last file appropriate.")
if __name__ == '__main__':
# part up to 'parallelization' is executed once at the beginning
inputDataType = "ged" # data type in which the input data is available, "ged" and "csv" are possible
# storage of the time at the beginning of the program run
starttime = time.perf_counter()
if inputDataType == "csv":
pass
elif inputDataType == "ged":
# loading the sources (exemplary here: GEDCOM files from GEDBAS)
# definition of the range in which the file names are located (e.g. 0.ged to 59999.ged)
begin = 0
end = 60000
# creation of a list with the possible file names
gedcomNamesList = []
while begin != end:
datename = str(begin) + ".ged"
gedcomNamesList.append(datename)
begin = begin + 1
# check if the files exist
# exclude non-existent files
gedcomNamesListClear = []
for i in gedcomNamesList:
# files are located in the 'data' subfolder
filepath = os.path.join("data", i)
try:
# if opening works, the file exists and is added to a new list
gedcom = open(filepath, "r", encoding="utf-8")
gedcom.close()
gedcomNamesListClear.append(i)
except FileNotFoundError:
pass
# open more context data
# data from the Historical Data Center of Saxony-Anhalt
# classification based on the Klassifikation der Berufe (KldB, Classification of Professions)
# data from another classification system can also be used here
# file contains already classified occupational variants
filename = os.path.join("data", "variants.csv")
fieldnames = ["idVariant", # unique ID of the occupational variant
"variant", # textual representation of the variant
"code" # code of the OhdAB
]
# loading data from existing file
# if no file exists, a new one is created
existingVariantsKldB = createFile(filename, fieldnames, ";", "latin1")
# status message on the number of existing variants
print("Status:", len(existingVariantsKldB), "classified variants already exist.")
# if halving of variants is to be done for testing purposes, set halving to "yes"
halving = "yes"
# deletion of every second already classified occupation information
if halving == "yes":
remainingVariantsKldB = []
for number, i in enumerate(existingVariantsKldB):
if number % 2 == 0:
remainingVariantsKldB.append(i)
print("Status: There has been a halving of the variants for testing purposes.", len(remainingVariantsKldB),
"variants remain.")
# overwrite the variable of all variants
existingVariantsKldB = remainingVariantsKldB
# create file for saving the newly classified variants
filenameVariants = os.path.join("data", "newVariants.csv")
fieldnamesVariants = ["variant", # designation of the new variant of an occupation
"lemma", # existing designation of an occupation to which the new variant is assigned
"code" # code according to KldB
]
createFile(filenameVariants, fieldnamesVariants, "\t", "latin1")
# list about the best hits for each checked job title
filenameDistance = "levenshteinDistance.csv"
fieldnamesDistance = ["relative distance", # absolute Levenshtein distance divided by the length of the variant
"absolute distance", # absolute Levenshtein distance
"variant", # designation of the new variant of an occupation
"best fit lemma", # designation of the best fitting existing variant
"selection", # binary information whether the lemma was selected (1 means yes, 0 means no)
"lemma row" # number of the line in the existing variants
]
# createFile(filenameDistance, fieldnamesDistance, "\t", "latin1")
# list for dividing the different components of a job specification
filenameOccu = "occuResult.csv"
fieldnamesOccu = ["variant", # designation of the new variant of an occupation
"source", # name of the file in which the variant occurs (source)
"number", # Number of occurrences of the variant in the source
"occupation 1-occupation", # information about the first occupation found
"occupation 1-KldB 2010",
"occupation 1-best fit lemma",
"occupation 1-row of best fit lemma",
"occupation 1-titel",
"occupation 1-role",
"occupation 1-year",
"occupation 1-url",
"occupation 1-location",
"occupation 1-further info",
"occupation 1-selection info",
"occupation 1-lemma row",
"occupation 1-absolute distance",
"occupation 1-relative distance",
"occupation 2-occupation", # information about the second occupation found0
"occupation 2-KldB 2010",
"occupation 2-best fit lemma",
"occupation 2-row of best fit lemma",
"occupation 2-titel",
"occupation 2-role",
"occupation 2-year",
"occupation 2-url",
"occupation 2-location",
"occupation 2-further info",
"occupation 2-selection info",
"occupation 2-similarity analysis",
"occupation 2-lemma row",
"occupation 2-absolute distance",
"occupation 2-relative distance",
"occupation 3-occupation", # information about the third occupation found
"occupation 3-KldB 2010",
"occupation 3-best fit lemma",
"occupation 3-row of best fit lemma",
"occupation 3-titel",
"occupation 3-role",
"occupation 3-year",
"occupation 3-url",
"occupation 3-location",
"occupation 3-further info",
"occupation 3-selection info",
"occupation 3-lemma row",
"occupation 3-absolute distance",
"occupation 3-relative distance",
"occupation 4-occupation", # information about the fourth occupation found
"occupation 4-KldB 2010",
"occupation 4-best fit lemma",
"occupation 4-row of best fit lemma",
"occupation 4-titel",
"occupation 4-role",
"occupation 4-year",
"occupation 4-url",
"occupation 4-location",
"occupation 4-further info",
"occupation 4-selection info",
"occupation 4-lemma row",
"occupation 4-absolute distance",
"occupation 4-relative distance",
"occupation 5-occupation", # information about the fifth occupation found
"occupation 5-KldB 2010",
"occupation 5-best fit lemma",
"occupation 5-row of best fit lemma",
"occupation 5-titel",
"occupation 5-role",
"occupation 5-year",
"occupation 5-url",
"occupation 5-location",
"occupation 5-further info",
"occupation 5-selection info",
"occupation 5-lemma row",
"occupation 5-absolute distance",
"occupation 5-relative distance",
]
createFile(filenameOccu, fieldnamesOccu, "\t", "latin1")
# definition of the keys for the separated professions
occuKeys = ["occupation 1", "occupation 2", "occupation 3", "occupation 4", "occupation 5"]
# initialization of a list in which the results of the upcoming parallelized process are stored
# this will process a list of occupation details in parallel
# the result is a list of dictionaries containing different information about the analysis (occuList)
occuList = []
# parallelization
if inputDataType == "csv":
occupationsList = loadData("occupations.csv", ";", "utf-8-sig")
listOfOccupations = []
for i in occupationsList:
listOfOccupations.append(i["occupation"])
if inputDataType == "ged":
pass
pool = Pool(1) # number of cores used is variable
dubletCounterDict = {}
if inputDataType == "csv":
# count duplicate occupational designations
designationList = [] # list of adjusted variants in the source
for occupation in listOfOccupations:
# if the variant has already been edited, it should not be edited again
# however, a counter should then be maintained, which documents the number of occurrences
if occupation in designationList: # comparison with already processed variants
# count up
dubletCounterDict.update({occupation: dubletCounterDict[occupation] + 1})
# if the occupation information has not yet been processed, then this should be done as follows
else: # occupation statement for the first time in this source
dubletCounterDict.update({occupation: 1})
# extension of the list of processed designations
designationList.append(occupation)
# delete duplicate occupational designations
print("Status: Using a csv file with", len(listOfOccupations), "occupations")
listOfOccupations = set(listOfOccupations)
print("Status: File contains", len(listOfOccupations), "different occupational titles")
for row in listOfOccupations:
occuList.append(createOccuList(row, existingVariantsKldB, "occupations.csv", dubletCounterDict)[0])
printOccuList(occuList, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu,
filenameVariants, filenameDistance, filenameOccu, occuKeys)
elif inputDataType == "ged":
occuList = pool.map(partial(preCreateOccuList,
existingVariantsKldB=existingVariantsKldB,
fieldnamesVariants=fieldnamesVariants,
fieldnamesDistance=fieldnamesDistance,
fieldnamesOccu=fieldnamesOccu,
filenameVariants=filenameVariants,
filenameDistance=filenameDistance,
filenameOccu=filenameOccu,
occuKeys=occuKeys), gedcomNamesListClear)
else:
print("Error: No valide inputDataType")
pool.close()
pool.join()
# second processing loop for the designations that are not found but have components
# Example: "farmer and craftsman" is not found, but "farmer" and "craftsman" are found individually
print("Status: Second processing started")
# second processing
gedcomNamesListClear2 = []
# iterate all original occupation information
for i in occuList:
# iterate the five possible keys ("occupation 1", ...)
for key in occuKeys:
# if the entry for the key does not contain any content, skip it
try:
if i[key] == {}:
continue
# only professions that are "not found"
if i[key]["selection info"] == "not found":
gedcomNamesListClear2.append(i[key]["occupation"])
except: # if it is still in a list with only one value
if i == [] or i[0][key] == {}:
continue
# only professions that are "not found"
if i[0][key]["selection info"] == "not found":
gedcomNamesListClear2.append(i[0][key]["occupation"])
# parallelization
pool = Pool(1) # number of cores used is variable
occuList2 = pool.map(partial(createOccuList,
existingVariantsKldB=existingVariantsKldB,
dubletCounterDict=dubletCounterDict,
filename="second try"), gedcomNamesListClear2)
pool.close()
pool.join()
# unpack list
occuList2new = []
for oneOccu in occuList2:
occuList2new.append(oneOccu[0])
occuList2 = occuList2new
printOccuList(occuList2, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu,
filenameVariants, filenameDistance, filenameOccu, occuKeys)
# the same again for a third iteration
# third processing
print("Status: Third processing started")
gedcomNamesListClear3 = []
# iterate all original occupation information
for i in occuList2:
# iterate the five possible keys ("occupation 1", ...)
for key in occuKeys:
# if the entry for the key does not contain any content, skip it
try:
if i[key] == {}:
continue
# only professions that are "not found"
if i[key]["selection info"] == "not found":
gedcomNamesListClear3.append(i[key]["occupation"])
except: # if it is still in a list with only one value
if i[0][key] == {}:
continue
# only professions that are "not found"
if i[0][key]["selection info"] == "not found":
gedcomNamesListClear3.append(i[0][key]["occupation"])
# parallelization
pool = Pool(1) # number of cores used is variable
occuList3 = pool.map(partial(createOccuList,
existingVariantsKldB=existingVariantsKldB,
dubletCounterDict=dubletCounterDict,
filename="third try"), gedcomNamesListClear3)
pool.close()
pool.join()
# unpack list
occuList3new = []
for oneOccu in occuList3:
occuList3new.append(oneOccu[0])
occuList3 = occuList3new
printOccuList(occuList3, existingVariantsKldB, fieldnamesVariants, fieldnamesDistance, fieldnamesOccu,
filenameVariants, filenameDistance, filenameOccu, occuKeys)
# creation of statistics for the three iterations
statistics(occuList, occuKeys)
statistics(occuList2, occuKeys)
statistics(occuList3, occuKeys)
# storage of the time at the ending of the program run
finishtime = time.perf_counter()
# status info
print("Status: Program finished in", round(finishtime - starttime, 2), "seconds(s)")
# Ignore the following warning: InsecureRequestWarning: Unverified HTTPS request is being made to host 'gedbas.genealogy.net'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Scraper based on the introduction at https://towardsdatascience.com/how-to-web-scrape-with-python-in-4-minutes-bc49186a8460
# published by Julia Kho on 27 September 2019, last accessed on 2 July 2021
# import of libraries
import requests
from bs4 import BeautifulSoup
import io
# initialisation of counters
number = 0
found = 0
notAllowed = 0
notFound = 0
notFoundSeries = 0
end = 10000
empty = 0
# possible GEDCOM files are determined by varying the URL
# the end of the URL is a number that is incremented
# the iteration runs until no result is found 10000 times in a row (value of variable "end")
while notFoundSeries != end:
# definition of the URL
url = "https://gedbas.genealogy.net/gedcom/export/" + str(number)
try:
# scraping the information
response = requests.get(url, verify=False)
gedcom = str(BeautifulSoup(response.text, "html.parser"))
# analysis of the information
# with "not allowed" no public access to the file is possible
# with "not found", the file is not (no longer) available
# in all other cases the information is written to a GEDCOM file
if gedcom != "not found" and gedcom != "not allowed" and gedcom != "":
filename = str(number) + ".ged"
file = io.open(filename, "w", encoding="utf-8")
file.write(gedcom)
file.close()
# count number of detected files
found = found + 1
# resetting the counter that counts the number of unsuccessful calls in series
notFoundSeries = 0
# count number of not allowed files
if gedcom == "not allowed":
notAllowed = notAllowed + 1
notFoundSeries = 0
# count number of files that no longer exist
if gedcom == "not found":
notFound = notFound + 1
notFoundSeries = notFoundSeries + 1
# count number of empty gedcom files
if gedcom == "":
empty = empty + 1
notFoundSeries = 0
except:
print("Status: There is an error in file " + str(number) + ".")
# gives info every 1000 urls
if number % 1000 == 0:
print("Status:", str(number) + " urls were analyzed")
# count up per analysed URL
number = number + 1
# printing status information
print("Status: Scraping finished")
print("Status: " + str(found) + " files could be found")
print("Status: Access was denied for " + str(notAllowed) + " files")
print("Status: " + str(notFound - end) + " files were deleted")
print("Status: " + str(empty) + " files were blank")
readme
The following instructions are intended to enable the use of the Python script and the interpretation of its results.
Libraries:
Before the program can be executed, additional libraries may need to be installed locally. The libraries used are listed in the first lines of the script.
Input files with occupational information:
The program is designed to process two different kinds of input files: (1) CSV files and (2) GEDCOM files. Depending on which kind is present, the parameter "inputDataType" in the program code must be set to "csv" or "ged".
If the occupational information is provided in a CSV file, it must be structured so that it contains a column whose first row bears the heading "occupation". The following rows then each contain one occupational designation to be lemmatized. A folder "data", in which the file is placed, must exist at the location of the script. The file is named "occupations.csv".
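A minimal sketch of a possible "occupations.csv" (the entries below are hypothetical examples):
occupation
Schmied
Bauer und Schmied
1845 Schmied in Halle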
If the occupational information is provided in GEDCOM files, the GEDCOM files must be named with consecutive numbers ("1.ged", "2.ged", etc.). Numbers must not be used twice. These files are also placed in the "data" subfolder.
Variant list:
Like the input file with the new occupational information, the CSV file with the existing variants is added to the "data" subfolder.
The file must be named "variants.csv". It contains three columns, bearing the headings "variant" and "OhdAB_01". The first column contains the textual designation and the second the assigned OhdAB code. If a different classification system is used, any coding can be used in the third column; the heading should nevertheless not be changed.
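A minimal sketch of a possible "variants.csv" (hypothetical entries with placeholder codes; note that the script itself reads the file semicolon-separated and expects the column headings "idVariant", "variant" and "code"):
idVariant;variant;code
1;Schmied;12345
2;Bauer;67890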
Parallelization:
GEDCOM files are processed in parallel to increase speed. The number of processor cores to be used can be set by changing the parameter of the "Pool()" function. If it is left empty, all available cores are used. In the script, the number of cores is set to one by default.
Halving the variants:
To halve the variants, the variable halving must be set to "yes". This option serves primarily for testing purposes.
Further iterations with the new variants:
Drawing on the newly lemmatized occupational variants, it is optionally possible to process further occupational variants for which there is no hit in the original variant list. By default, three iterations are set up. If these are not to take place, the code sections after the comments "second processing" and "third processing" must be commented out.
Output files:
As a result, the file "occuResult.csv" is generated and continuously extended during the program run. Its columns are separated by tabs. It contains the designation of the checked variant ("variant"), for GEDCOM files the name of the file in which it occurred ("source"), and the number of occurrences of this variant in this source ("number"). The further columns contain various pieces of information on the individual occupational statements (occupation 1 to occupation 5). A maximum of five individual occupations is identified (e.g. from the statement "Häusler und Fleischer und Gastwirt und Richter und Schenker"). The information is structured as follows:
Scheme: designation - explanation
occupation - cleaned occupational designation
KldB 2010 - OhdAB code, if an assignment can be made
best fit lemma - designation of the best-fitting variant
row of best fit lemma - row of the best-fitting variant in the variant list
titel - title information filtered out of the original occupational statement
role - role information filtered out of the original occupational statement
year - year information filtered out of the original occupational statement
url - URL filtered out of the original occupational statement
location - place information filtered out of the original occupational statement
further info - other information filtered out of the original occupational statement (was given in brackets)
selection info - information on whether the occupational statement could be assigned to an existing variant ("found direct", "not found", "found after levenshtein", "no occupational designation")
absolute distance - absolute Levenshtein distance to the best-fitting variant
relative distance - relative Levenshtein distance to the best-fitting variant
In addition, a file "newVariants.csv" is created in the "data" subfolder; it contains the designation of the new variant ("variant"), the designation of the already existing variant ("lemma"), and the OhdAB code ("OhdAB_01").
Jan Michael Goldberg, 22 February 2022