"""Pipeline driver for GEDCOM place analysis.

Per source file (GEDCOM) three steps run in parallel worker processes:
1. metadata/quality analysis  -> appended to "quality.csv"
2. place-name identification  -> appended to "placefinder.csv"
3. administrative clustering  -> appended to "provincesdict.csv"
Place lookup is backed by the genealogy.net Mini-GOV gazetteer files.
"""

import csv
import json
import os.path
import time
from functools import partial
from multiprocessing import Pool, current_process

from zeep import Client

import placefinder
import provincefinder
import qualitychecker


def importMiniGOV():
    """
    Load the Mini-GOV files located in the data folder and merge them.
    This is used to assign a location to a standard notation.
    :return: list of Mini-GOV entries (dicts), sorted by place name
    """
    # Column layout per http://wiki-de.genealogy.net/GOV/Mini-GOV, 23.03.2020.
    # German-language designations, because those in the Mini-GOV are also in German.
    miniGOVHeaders = ["GOV-Kennung", "Objekttyp als Text", "Objekttyp als Zahl", "aktueller Name",
                      "letzter deutscher Name", "Staat", "adm. Zuordnung 1", "adm. Zuordnung 2", "adm. Zuordnung 3",
                      "adm. Zuordnung 4", "Postleitzahl", "geographische Breite", "geographische Länge"]

    # Mini-GOVs of different countries (Germany, Poland, Austria, Switzerland,
    # Czech Republic, Denmark, France, Netherlands).
    miniGOVFiles = ["gov-data_D_20190325_201241.txt", "gov-data_PL_20190325_201241.txt",
                    "gov-data_A_20190325_201241.txt", "gov-data_CH.txt", "gov-data_CZ_20190325_201241.txt",
                    "gov-data_DK.txt", "gov-data_F_20190325_201241.txt", "gov-data_NL.txt"]

    miniGOV = []  # merged entries of all Mini-GOV files
    for name in miniGOVFiles:
        filepath = os.path.join("data", name)
        with open(filepath, encoding="utf-8") as csvfile:
            reader = csv.DictReader(csvfile, delimiter="\t", fieldnames=miniGOVHeaders)
            miniGOV.extend(reader)

    # Some entries carry a "last German name"; create one ADDITIONAL entry per
    # such row so the place can also be found under its former name.
    # BUGFIX: the row must be copied before overwriting "aktueller Name" --
    # mutating the shared dict in place would destroy the current-name entry
    # and merely append a second reference to the same (mutated) row.
    expandMiniGOV = []
    for entry in miniGOV:
        if entry["letzter deutscher Name"] != "":
            duplicate = dict(entry)
            duplicate["aktueller Name"] = duplicate["letzter deutscher Name"]
            expandMiniGOV.append(duplicate)

    # merge original entries and former-name duplicates
    miniGOV = miniGOV + expandMiniGOV

    # Alphabetical sort on the place-name column, case-insensitively.
    # .lower() is essential: otherwise capital letters sort before lower-case
    # ones, and such cases occur in the GOV (e.g. places starting with "IJ",
    # like IJselstein).
    miniGOV.sort(key=lambda row: row["aktueller Name"].lower())
    return miniGOV


def loadData(filename, delimiter, encoding):
    """
    Open a file in which data was stored by a previous run of the program.
    :param filename: name of the file
    :param delimiter: delimiter string of the CSV file
    :param encoding: text encoding of the file
    :return: content of the file as a list of dicts; empty list if no file exists
    """
    rows = []  # list of dicts, one per CSV row
    try:
        with open(filename, "r", encoding=encoding) as data:
            for row in csv.DictReader(data, delimiter=delimiter):
                # plain dict copy (equivalent to the former json round-trip)
                rows.append(dict(row))
    except FileNotFoundError:
        print("Status: Initial run, do not create a list yet:", filename)
    return rows


def appendFile(filename, data, fieldnames, moreThanOneRow):
    """
    Append one or several lines to an existing file.
    :param filename: name of the file
    :param data: content of the line(s) to be added
    :param fieldnames: column headers of the file
    :param moreThanOneRow: 0 -> write a single row, otherwise write several rows
    """
    with open(filename, "a", newline="", encoding="utf-8") as output:
        writer = csv.DictWriter(output, fieldnames=fieldnames, delimiter="\t")
        # distinction between "quality.csv" (exactly one row per source) and
        # the other files (several rows per source)
        if moreThanOneRow == 0:
            writer.writerow(data)
        else:
            writer.writerows(data)


def createFile(filename, fieldnames, delimiter, encoding):
    """
    Create a file with a header row if it does not yet exist.
    If it already exists, its existing content is loaded instead.
    :param filename: name of the file
    :param fieldnames: column headers of the file
    :param delimiter: delimiter string of the CSV file
    :param encoding: text encoding of the file
    :return: loaded data; empty list if the file had no data
    """
    loadedData = loadData(filename, delimiter, encoding)
    # only create a fresh file (with header) when nothing could be loaded
    if not loadedData:
        with open(filename, "w", newline="", encoding="utf-8") as output:
            writer = csv.writer(output, delimiter=delimiter)
            writer.writerow(fieldnames)
    return loadedData


def loadGedcomFile(datename):
    """
    Load the data of a single GEDCOM file.
    If the sources are not in GEDCOM format, this area must be adjusted.
    :param datename: name of the source file (here a GEDCOM file)
    :return: list with one entry per line of the GEDCOM file;
             the string "NONE" if the file cannot be found
    """
    filepath = os.path.join("data", datename)
    line = []
    try:
        with open(filepath, "r", encoding="utf-8") as gedcom:
            # drop the last character of each line (trailing newline/space)
            data = gedcom.readline()[:-1]
            # NOTE(review): the very first line read above is never appended,
            # and a trailing "" ends up as the last list element.  This quirky
            # behavior of the original is preserved on purpose -- confirm the
            # downstream parsers rely on it before changing it.
            while data != "":
                data = gedcom.readline()[:-1]
                line.append(data)
        return line
    except FileNotFoundError:
        print("Error: Problem with access to file", datename, ".")
        return "NONE"


def parallel(filename, miniGovList, qualityDict, fieldnamesStep1, fieldnamesStep2, fieldnamesStep3, filenameStep1,
             filenameStep2, filenameStep3):
    """
    Process one source (here a GEDCOM file) in three steps:
    1. metadata analysis, whose result extends the file "quality.csv";
    2. identification of the urban names ("placefinder.csv");
    3. regional clustering at a defined reference time ("provincesdict.csv").
    :param filename: name of the file/source
    :param miniGovList: list of merged entries of the Mini-GOV
    :param qualityDict: metadata about the data from previous program runs
    :param fieldnamesStep1: column names of the file "quality.csv"
    :param fieldnamesStep2: column names of the file "placefinder.csv"
    :param fieldnamesStep3: column names of the file "provincesdict.csv"
    :param filenameStep1: file name "quality.csv"
    :param filenameStep2: file name "placefinder.csv"
    :param filenameStep3: file name "provincesdict.csv"
    """
    # note the name of the parallelization worker process (for log output)
    spawnPoolWorker = current_process().name

    # load data of a GEDCOM file
    # must be changed if source is not a GEDCOM file
    data = loadGedcomFile(filename)

    # Step 1: Metadata/Quality analysis
    print(spawnPoolWorker, "Status: Metadata analysis of", filename, "begins.")
    resultQualityChecker = qualitychecker.mainMetadataInspector(data, filename, miniGovList, qualityDict)
    if resultQualityChecker == "StartingExitStrategy":
        # this file was already fully processed in an earlier run
        print(spawnPoolWorker, "Status: The data to file", filename, "is complete.")
        return

    # Step 2: Identification
    print(spawnPoolWorker, "Status: Identifying the places of", filename, "begins.")
    resultPlaceFinder = placefinder.mainPlaceFinder(data, resultQualityChecker, filename, miniGovList)

    # Step 3: Clustering
    print(spawnPoolWorker, "Status: Clustering of the places of", filename, "begins.")
    # year at which the administrative clustering should take place
    referencetime = 1800
    # a working internet connection is necessary
    client = Client("https://gov.genealogy.net/services/ComplexService?wsdl")
    resultProvinceFinder = provincefinder.mainProvinceFinder(resultPlaceFinder, filename, client, referencetime)

    # Blocked file extension: all three files are appended together so that
    # the output stays at the same per-source level across the files.
    try:
        appendFile(filenameStep1, resultQualityChecker, fieldnamesStep1, 0)  # only one row
        appendFile(filenameStep2, resultPlaceFinder, fieldnamesStep2, 1)
        appendFile(filenameStep3, resultProvinceFinder, fieldnamesStep3, 1)
    except Exception:
        print("Error: Blocked printing of lines failed. Manual deletion of the last entries in the files attached.")


if __name__ == "__main__":
    """
    This construction exists to prepare the parallelization.
    The section up to the comment "start of parallelization" is executed only
    once: it loads the location data from the sources (here GEDCOM files) and
    creates the CSV files initially.
    """
    # memorize the start time
    starttime = time.perf_counter()

    # define range of GEDCOM data; assume files are in 12345.ged format
    begin = 0      # starts at 0.ged
    end = 60000    # ends at 60000
    gedcomNamesList = [str(number) + ".ged" for number in range(begin, end)]

    # Not all files of gedcomNamesList may exist; keep non-existent files out
    # of the multiprocessing by checking each one for readability.
    gedcomNamesListClear = []
    for name in gedcomNamesList:
        filepath = os.path.join("data", name)  # GEDCOM files live in "data"
        try:
            with open(filepath, "r", encoding="utf-8"):
                pass
            gedcomNamesListClear.append(name)
        except FileNotFoundError:
            pass

    # loading data from the Mini-GOV
    miniGovList = importMiniGOV()

    # initialization of the CSV files needed in the further course;
    # header strings are kept exactly as written by earlier runs
    filenameStep1 = "quality.csv"
    fieldnamesStep1 = ["filename",  # name of GEDCOM file
                       "number of places",
                       "number of noHit",
                       "number of moreThanOneHit",
                       "number of definitely coordinates",
                       "longitude mean of of definitely coordinates",
                       "latitude mean of of definitely coordinates",
                       "number of existing clusters",
                       "number of relevant clusters",
                       "cluster midpoints"  # list of lists of geographical centers of individual clusters
                       ]
    # load already existing data into a variable
    qualityDict = createFile(filenameStep1, fieldnamesStep1, "\t", "utf-8")

    # list of all urbanonyms per source
    filenameStep2 = "placefinder.csv"
    fieldnamesStep2 = ["id",  # GOV-ID of a place
                       "latitude",  # latitude of the place
                       "longitude",  # longitude of the place
                       "selection information",  # description of the identification of this urbanonym
                       "adjusted name",  # adjusted spelling of the urbanonym in the source
                       "original name",  # original spelling of the urbanonym in the source
                       "filename"  # name of the file where the urbanonym is found
                       ]
    createFile(filenameStep2, fieldnamesStep2, "\t", "utf-8")

    # urbanonyms already assigned to a province per file, to avoid double searches
    filenameStep3 = "provincesdict.csv"
    fieldnamesStep3 = ["original name",  # original spelling of the urbanonym in the source
                       "filename",  # name of the file where the urbanonym is found
                       "id",  # GOV-ID of a place
                       "province"  # name of assigned administrative unit
                       ]
    createFile(filenameStep3, fieldnamesStep3, "\t", "utf-8")

    # start of parallelization: run "parallel" once per existing source file
    # to shorten the overall processing time
    pool = Pool()
    pool.map(partial(parallel,
                     miniGovList=miniGovList,
                     qualityDict=qualityDict,
                     fieldnamesStep1=fieldnamesStep1,
                     fieldnamesStep2=fieldnamesStep2,
                     fieldnamesStep3=fieldnamesStep3,
                     filenameStep1=filenameStep1,
                     filenameStep2=filenameStep2,
                     filenameStep3=filenameStep3),
             gedcomNamesListClear)
    pool.close()
    pool.join()

    # memorize the time of finishing and print the duration of the program run
    finishtime = time.perf_counter()
    print("Finished in", round(finishtime - starttime, 2), "second(s)")