Commit 7a54214a authored by Marcus Baumgarten's avatar Marcus Baumgarten

Upload new file

parent 9bff0b88
main.py 0 → 100644
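"""
This program processes a corpus of GEDCOM files in three steps per file:
a metadata/quality analysis ("quality.csv"), the identification of place
names ("placefinder.csv"), and an administrative clustering of those places
("provincesdict.csv"). The actual analysis logic lives in the imported
modules qualitychecker, placefinder and provincefinder; this file loads the
Mini-GOV reference data and orchestrates the three steps in parallel.
"""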
import qualitychecker
import placefinder
import provincefinder
import csv
import os.path
from multiprocessing import Pool, current_process
from functools import partial
import time
from zeep import Client
import json
def importMiniGOV():
"""
This function loads the Mini-GOV that is located in the data folder.
It is used to map place names to a standardized notation.
:return: list of Mini-GOV entries
"""
# Information from http://wiki-de.genealogy.net/GOV/Mini-GOV, 23.03.2020
    # German column names, because the designations in the Mini-GOV are also in German
miniGOVHeaders = ["GOV-Kennung", "Objekttyp als Text", "Objekttyp als Zahl", "aktueller Name",
"letzter deutscher Name", "Staat", "adm. Zuordnung 1", "adm. Zuordnung 2", "adm. Zuordnung 3",
"adm. Zuordnung 4", "Postleitzahl", "geographische Breite", "geographische Länge"]
# embedding the Mini-GOVs of different countries (Germany, Poland, Austria, Switzerland, Czech Republic, Denmark, France, Netherlands)
miniGOVFiles = ["gov-data_D_20190325_201241.txt", "gov-data_PL_20190325_201241.txt",
"gov-data_A_20190325_201241.txt", "gov-data_CH.txt", "gov-data_CZ_20190325_201241.txt",
"gov-data_DK.txt", "gov-data_F_20190325_201241.txt", "gov-data_NL.txt"]
miniGOV = [] # initialize list with entries of Mini-GOV
# for each named Mini-GOV file the data is loaded and merged into miniGOV
for i in miniGOVFiles:
filepath = os.path.join("data", i)
with open(filepath, encoding="utf-8") as csvfile:
reader = csv.DictReader(csvfile, delimiter="\t", fieldnames=miniGOVHeaders)
miniGOV = miniGOV + list(reader)
    # expand miniGOV, because some Mini-GOV entries contain a last German name
    # for each former German name an additional entry is created so that the place can also be found under that name
expandMiniGOV = [] # initialize
for i in miniGOV:
if i["letzter deutscher Name"] != "":
i["aktueller Name"] = i["letzter deutscher Name"] # overwrite the current name with the last german name
else:
continue; # do not append
expandMiniGOV.append(i)
# merge miniGOV and expandMiniGOV
miniGOV = miniGOV + expandMiniGOV
# alphabetical sorting of miniGOV in relation to the column with the name of the place
# all designations are written in lower case
    # .lower() is essential here, because otherwise upper-case letters would sort before lower-case ones;
    # such cases occur in the GOV (e.g. some places starting with IJ, like IJselstein)
    miniGOV = sorted(miniGOV, key=lambda x: x["aktueller Name"].lower())
    return miniGOV
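# illustrative use, assuming the Mini-GOV files listed above exist in "data/":
#   miniGOV = importMiniGOV()
#   miniGOV[0]["aktueller Name"]  # alphabetically first place name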
def loadData(filename, delimiter, encoding):
"""
    This function opens files in which data was temporarily stored by a previous run of the program.
    :param filename: name of the file
    :param delimiter: delimiter string
    :param encoding: encoding of the file
    :return: content of the file as a list of dicts; an empty list if the file does not exist
"""
    emptyList = []  # list of dicts
try:
with open(filename, "r", encoding=encoding) as data:
for i in csv.DictReader(data, delimiter=delimiter):
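                # the json round trip normalizes each row to a plain dict
                # (csv.DictReader returns OrderedDict on Python versions before 3.8)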
i = json.loads(json.dumps(i))
emptyList.append(i)
except FileNotFoundError:
print("Status: Initial run, do not create a list yet:", filename)
    return emptyList
def appendFile(filename, data, fieldnames, moreThanOneRow):
"""
This function adds a line to an existing file.
:param filename: name of the file
:param data: content of the line to be added
:param fieldnames: column headers of the file
    :param moreThanOneRow: 0 writes a single row, any other value writes multiple rows (integer)
"""
openQualityChecker = open(filename, "a", newline="", encoding="utf-8")
writerQualityChecker = csv.DictWriter(openQualityChecker, fieldnames=fieldnames, delimiter="\t")
# distinction between adding data to "quality.csv" file and other files
# with "quality.csv" only one row is added, with all others several rows
if moreThanOneRow == 0:
writerQualityChecker.writerow(data)
else:
writerQualityChecker.writerows(data)
openQualityChecker.close()
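# example call with a hypothetical row (the real fieldnames are defined in the main block below):
#   appendFile("quality.csv", {"filename": "0.ged", ...}, fieldnamesStep1, 0)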
def createFile(filename, fieldnames, delimiter, encoding):
"""
This function is used to create files if they do not yet exist.
But if they already exist, the existing content will be loaded.
    :param filename: name of the file
    :param fieldnames: column headers of the file
    :param delimiter: delimiter string
    :param encoding: encoding of the file
    :return: loaded data; if there is no data yet, an empty list is returned
"""
# load existing content
loadedData = loadData(filename, delimiter, encoding)
# create a new file if it is not there
if len(loadedData) == 0: # only if loadedData is an empty list a new file is created
openQualityChecker = open(filename, "w", newline="", encoding="utf-8")
writerQualityChecker = csv.writer(openQualityChecker, delimiter=delimiter)
writerQualityChecker.writerow(fieldnames)
openQualityChecker.close()
return (loadedData)
def loadGedcomFile(datename):
"""
This function loads the data from a single GEDCOM file.
If the sources are not in GEDCOM format, this area must be adjusted.
:param datename: name of source (here GEDCOM file)
:return: list containing one entry per line of a GEDCOM file; if the file cannot be found "NONE" is returned
"""
filepath = os.path.join("data", datename)
    line = []  # initialize empty list
    try:
        with open(filepath, "r", encoding="utf-8") as gedcom:
            data = gedcom.readline()
            # lines are read until readline() returns the empty string at the end of the file;
            # the trailing newline character is cut off each line
            while data != "":
                line.append(data[:-1])
                data = gedcom.readline()
        return line
    except FileNotFoundError:
        print("Error: Problem with access to file", datename)
        return "NONE"
def parallel(filename, miniGovList, qualityDict, fieldnamesStep1, fieldnamesStep2, fieldnamesStep3, filenameStep1,
filenameStep2, filenameStep3):
"""
This function is called once per source (here GEDCOM file).
The process consists of three steps.
First, a metadata analysis is performed, the result of which can be found in the file "quality.csv".
    Then the place names (urbanonyms) in the source are identified.
In the third step, regional clustering is performed at a defined time.
The goal is to extend the files "quality.csv", "placefinder.csv" and "provincesdict.csv".
:param filename: name of the file/source
:param miniGovList: list of merged entries of the Mini-GOV
:param qualityDict: Metadata about the data from previous program runs
:param fieldnamesStep1: name of the columns of the file "quality.csv"
:param fieldnamesStep2: name of the columns of the file "placefinder.csv"
:param fieldnamesStep3: name of the columns of the file "provincesdict.csv"
:param filenameStep1: string of the file name "quality.csv"
:param filenameStep2: string of the file name "placefinder.csv"
:param filenameStep3: string of the file name "provincesdict.csv"
"""
    # a loop with a single iteration is used so that "continue" can serve as an early exit
for i in range(1):
# note the number of the parallelization process
spawnPoolWorker = current_process().name
# load data of a GEDCOM file
# must be changed if source is not a GEDCOM file
data = loadGedcomFile(filename)
# Step 1: Metadata/Quality analysis
print(spawnPoolWorker, "Status: Metadata analysis of", filename, "begins.")
resultQualityChecker = qualitychecker.mainMetadataInspector(data, filename, miniGovList, qualityDict)
if resultQualityChecker == "StartingExitStrategy":
print(spawnPoolWorker, "Status: The data to file", filename, "is complete.")
continue # check next file
# Step 2: Identification
print(spawnPoolWorker, "Status: Identifying the places of", filename, "begins.")
resultPlaceFinder = placefinder.mainPlaceFinder(data, resultQualityChecker, filename, miniGovList)
# Step 3: Clustering
print(spawnPoolWorker, "Status: Clustering of the places of", filename, "begins.")
# definition of a year at which the administrative clustering should take place
referencetime = 1800
# a working internet connection is necessary
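        # zeep downloads and parses the WSDL already when the Client is constructed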
client = Client("https://gov.genealogy.net/services/ComplexService?wsdl")
resultProvinceFinder = provincefinder.mainProvinceFinder(resultPlaceFinder, filename, client, referencetime)
        # block-wise extension of the output files:
        # once output is written, all three files are at the same state
try:
appendFile(filenameStep1, resultQualityChecker, fieldnamesStep1, 0) # only one row
appendFile(filenameStep2, resultPlaceFinder, fieldnamesStep2, 1)
appendFile(filenameStep3, resultProvinceFinder, fieldnamesStep3, 1)
        except Exception:
            print("Error: Block-wise writing of the lines failed. Manually delete the entries just appended to the files.")
if __name__ == "__main__":
"""
This construction exists to prepare the parallelization.
The section up to the comment "start of parallelization" is executed only once.
It is used to load the location data from the source (here GEDCOM files) and create CSV files initially.
"""
# memorizing the start time
starttime = time.perf_counter()
# define range of GEDCOM data
# assume that the GEDCOM files are in 12345.ged format
begin = 0 # starts at 0.ged
    end = 60000  # exclusive upper bound, so the last file is 59999.ged
gedcomNamesList = [] # creation of a list with possible GEDCOM file names
while begin != end:
datename = str(begin) + ".ged" # name of GEDCOM file
gedcomNamesList.append(datename)
begin = begin + 1
# possibility that not all files of the gedcomNamesList exist
# do not let the non-existent files into the multiprocessing
# check the existence of the files
gedcomNamesListClear = [] # version of gedcomNamesList, which contains only existing files
for i in gedcomNamesList:
filepath = os.path.join("data", i) # GEDCOM files are located in the subfolder "data"
try:
gedcom = open(filepath, "r", encoding="utf-8")
gedcom.close()
gedcomNamesListClear.append(i)
except FileNotFoundError:
pass
# Loading data from the Mini-GOV
miniGovList = importMiniGOV()
# initialization of CSV files, which are needed in the further course
filenameStep1 = "quality.csv"
fieldnamesStep1 = ["filename", # name of GEDCOM file
"number of places",
"number of noHit",
"number of moreThanOneHit",
"number of definitely coordinates",
"longitude mean of of definitely coordinates",
"latitude mean of of definitely coordinates",
"number of existing clusters",
"number of relevant clusters",
"cluster midpoints" # list of lists of geographical centers of individual clusters
]
# load already existing data into a variable
qualityDict = createFile(filenameStep1, fieldnamesStep1, "\t", "utf-8")
    # list of all urbanonyms per source
filenameStep2 = "placefinder.csv"
fieldnamesStep2 = ["id", # GOV-ID of a place
"latitude", # latitude of the place
"longitude", # longitude of the place
"selection information", # description of the identification of this urbanonym
"adjusted name", # adjusted spelling of the urbanonym in the source
"original name", # original spelling of the urbanonym in the source
"filename" # name of the file where the urbanonym is found
]
createFile(filenameStep2, fieldnamesStep2, "\t", "utf-8")
# list of urban names already assigned to a province per file to avoid double searches
filenameStep3 = "provincesdict.csv"
fieldnamesStep3 = ["original name", # original spelling of the urbanonym in the source
"filename", # name of the file where the urbanonym is found
"id", # GOV-ID of a place
"province" # name of assigned administrative unit
]
createFile(filenameStep3, fieldnamesStep3, "\t", "utf-8")
# start of parallelization
    # executes the function "parallel" once per entry in the list gedcomNamesListClear (per source file)
# parallelization is realized to shorten the processing time
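    # Pool() starts one worker process per CPU core by default (os.cpu_count())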
pool = Pool()
    pool.map(partial(parallel,
                     miniGovList=miniGovList,
                     qualityDict=qualityDict,
                     fieldnamesStep1=fieldnamesStep1,
                     fieldnamesStep2=fieldnamesStep2,
                     fieldnamesStep3=fieldnamesStep3,
                     filenameStep1=filenameStep1,
                     filenameStep2=filenameStep2,
                     filenameStep3=filenameStep3),
             gedcomNamesListClear)
pool.close()
pool.join()
# memorizing the time of finishing
finishtime = time.perf_counter()
# print the duration of the program run
print("Finished in", round(finishtime - starttime, 2), "seconds(s)")