Commit 7a54214a authored by Marcus Baumgarten's avatar Marcus Baumgarten

Upload new file

parent 9bff0b88
main.py 0 → 100644
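"""
This program processes a corpus of GEDCOM files in three steps per file:
a metadata/quality analysis ("quality.csv"), the identification of place
names ("placefinder.csv"), and an administrative clustering of those places
("provincesdict.csv"). The actual analysis logic lives in the imported
modules qualitychecker, placefinder and provincefinder; this file loads the
Mini-GOV reference data and orchestrates the three steps in parallel.
"""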
import qualitychecker
import placefinder
import provincefinder
import csv
import os.path
from multiprocessing import Pool, current_process
from functools import partial
import time
from zeep import Client
import json
def importMiniGOV():
"""
This function loads the Mini-GOV that is located in the data folder.
It is used to map place names to a standardized notation.
:return: list of Mini-GOV entries
"""
# Information from http://wiki-de.genealogy.net/GOV/Mini-GOV, 23.03.2020
    # German column names, because the designations in the Mini-GOV are also in German
miniGOVHeaders = ["GOV-Kennung", "Objekttyp als Text", "Objekttyp als Zahl", "aktueller Name",
"letzter deutscher Name", "Staat", "adm. Zuordnung 1", "adm. Zuordnung 2", "adm. Zuordnung 3",
"adm. Zuordnung 4", "Postleitzahl", "geographische Breite", "geographische Länge"]
# embedding the Mini-GOVs of different countries (Germany, Poland, Austria, Switzerland, Czech Republic, Denmark, France, Netherlands)
miniGOVFiles = ["gov-data_D_20190325_201241.txt", "gov-data_PL_20190325_201241.txt",
"gov-data_A_20190325_201241.txt", "gov-data_CH.txt", "gov-data_CZ_20190325_201241.txt",
"gov-data_DK.txt", "gov-data_F_20190325_201241.txt", "gov-data_NL.txt"]
miniGOV = [] # initialize list with entries of Mini-GOV
# for each named Mini-GOV file the data is loaded and merged into miniGOV
for i in miniGOVFiles:
filepath = os.path.join("data", i)
with open(filepath, encoding="utf-8") as csvfile:
reader = csv.DictReader(csvfile, delimiter="\t", fieldnames=miniGOVHeaders)
miniGOV = miniGOV + list(reader)
    # expand miniGOV, because some Mini-GOV entries contain a last German name
    # for each former German name an additional entry is created so that the place can also be found under that name
expandMiniGOV = [] # initialize
for i in miniGOV:
if i["letzter deutscher Name"] != "":
i["aktueller Name"] = i["letzter deutscher Name"] # overwrite the current name with the last german name
else:
continue; # do not append
expandMiniGOV.append(i)
# merge miniGOV and expandMiniGOV
miniGOV = miniGOV + expandMiniGOV
# alphabetical sorting of miniGOV in relation to the column with the name of the place
# all designations are written in lower case
    # .lower() is essential here, because otherwise upper-case letters would sort before lower-case ones;
    # such cases occur in the GOV (e.g. some places starting with IJ, like IJselstein)
    miniGOV = sorted(miniGOV, key=lambda x: x["aktueller Name"].lower())
    return miniGOV
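# illustrative use, assuming the Mini-GOV files listed above exist in "data/":
#   miniGOV = importMiniGOV()
#   miniGOV[0]["aktueller Name"]  # alphabetically first place name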
def loadData(filename, delimiter, encoding):
"""
    This function opens files in which data was temporarily stored by a previous run of the program.
    :param filename: name of the file
    :param delimiter: delimiter string
    :param encoding: encoding of the file
    :return: content of the file as a list of dicts; an empty list if the file does not exist
"""
    emptyList = []  # list of dicts
try:
with open(filename, "r", encoding=encoding) as data:
for i in csv.DictReader(data, delimiter=delimiter):
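                # the json round trip normalizes each row to a plain dict
                # (csv.DictReader returns OrderedDict on Python versions before 3.8)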
i = json.loads(json.dumps(i))
emptyList.append(i)
except FileNotFoundError:
print("Status: Initial run, do not create a list yet:", filename)
    return emptyList
def appendFile(filename, data, fieldnames, moreThanOneRow):
"""
This function adds a line to an existing file.
:param filename: name of the file
:param data: content of the line to be added
:param fieldnames: column headers of the file
    :param moreThanOneRow: 0 writes a single row, any other value writes multiple rows (integer)
"""
openQualityChecker = open(filename, "a", newline="", encoding="utf-8")
writerQualityChecker = csv.DictWriter(openQualityChecker, fieldnames=fieldnames, delimiter="\t")
# distinction between adding data to "quality.csv" file and other files
# with "quality.csv" only one row is added, with all others several rows
if moreThanOneRow == 0:
writerQualityChecker.writerow(data)
else:
writerQualityChecker.writerows(data)
openQualityChecker.close()
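# example call with a hypothetical row (the real fieldnames are defined in the main block below):
#   appendFile("quality.csv", {"filename": "0.ged", ...}, fieldnamesStep1, 0)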
def createFile(filename, fieldnames, delimiter, encoding):
"""
This function is used to create files if they do not yet exist.
But if they already exist, the existing content will be loaded.
    :param filename: name of the file
    :param fieldnames: column headers of the file
    :param delimiter: delimiter string
    :param encoding: encoding of the file
    :return: loaded data; if there is no data yet, an empty list is returned
"""
# load existing content
loadedData = loadData(filename, delimiter, encoding)
# create a new file if it is not there
if len(loadedData) == 0: # only if loadedData is an empty list a new file is created
openQualityChecker = open(filename, "w", newline="", encoding="utf-8")
writerQualityChecker = csv.writer(openQualityChecker, delimiter=delimiter)
writerQualityChecker.writerow(fieldnames)
openQualityChecker.close()
return (loadedData)
def loadGedcomFile(datename):
"""
This function loads the data from a single GEDCOM file.
If the sources are not in GEDCOM format, this area must be adjusted.
:param datename: name of source (here GEDCOM file)
:return: list containing one entry per line of a GEDCOM file; if the file cannot be found "NONE" is returned
"""
filepath = os.path.join("data", datename)
    line = []  # initialize empty list
    try:
        with open(filepath, "r", encoding="utf-8") as gedcom:
            data = gedcom.readline()
            # lines are read until readline() returns the empty string at the end of the file;
            # the trailing newline character is cut off each line
            while data != "":
                line.append(data[:-1])
                data = gedcom.readline()
        return line
    except FileNotFoundError:
        print("Error: Problem with access to file", datename)
        return "NONE"
def parallel(filename, miniGovList, qualityDict, fieldnamesStep1, fieldnamesStep2, fieldnamesStep3, filenameStep1,
filenameStep2, filenameStep3):
"""
This function is called once per source (here GEDCOM file).
The process consists of three steps.
First, a metadata analysis is performed, the result of which can be found in the file "quality.csv".
    Then the place names (urbanonyms) in the source are identified.
In the third step, regional clustering is performed at a defined time.
The goal is to extend the files "quality.csv", "placefinder.csv" and "provincesdict.csv".
:param filename: name of the file/source
:param miniGovList: list of merged entries of the Mini-GOV
:param qualityDict: Metadata about the data from previous program runs
:param fieldnamesStep1: name of the columns of the file "quality.csv"
:param fieldnamesStep2: name of the columns of the file "placefinder.csv"
:param fieldnamesStep3: name of the columns of the file "provincesdict.csv"
:param filenameStep1: string of the file name "quality.csv"
:param filenameStep2: string of the file name "placefinder.csv"
:param filenameStep3: string of the file name "provincesdict.csv"
"""
    # a loop with a single iteration is used so that "continue" can serve as an early exit
for i in range(1):
# note the number of the parallelization process
spawnPoolWorker = current_process().name
# load data of a GEDCOM file
# must be changed if source is not a GEDCOM file
data = loadGedcomFile(filename)
# Step 1: Metadata/Quality analysis
print(spawnPoolWorker, "Status: Metadata analysis of", filename, "begins.")
resultQualityChecker = qualitychecker.mainMetadataInspector(data, filename, miniGovList, qualityDict)
if resultQualityChecker == "StartingExitStrategy":
print(spawnPoolWorker, "Status: The data to file", filename, "is complete.")
continue # check next file
# Step 2: Identification
print(spawnPoolWorker, "Status: Identifying the places of", filename, "begins.")
resultPlaceFinder = placefinder.mainPlaceFinder(data, resultQualityChecker, filename, miniGovList)
# Step 3: Clustering
print(spawnPoolWorker, "Status: Clustering of the places of", filename, "begins.")
# definition of a year at which the administrative clustering should take place
referencetime = 1800
# a working internet connection is necessary
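        # zeep downloads and parses the WSDL already when the Client is constructed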
client = Client("https://gov.genealogy.net/services/ComplexService?wsdl")
resultProvinceFinder = provincefinder.mainProvinceFinder(resultPlaceFinder, filename, client, referencetime)
        # block-wise extension of the output files:
        # once output is written, all three files are at the same state
try:
appendFile(filenameStep1, resultQualityChecker, fieldnamesStep1, 0) # only one row
appendFile(filenameStep2, resultPlaceFinder, fieldnamesStep2, 1)
appendFile(filenameStep3, resultProvinceFinder, fieldnamesStep3, 1)
        except Exception:
            print("Error: Block-wise writing of the lines failed. Manually delete the entries just appended to the files.")
if __name__ == "__main__":
"""
This construction exists to prepare the parallelization.
The section up to the comment "start of parallelization" is executed only once.
It is used to load the location data from the source (here GEDCOM files) and create CSV files initially.
"""
# memorizing the start time
starttime = time.perf_counter()
# define range of GEDCOM data
# assume that the GEDCOM files are in 12345.ged format
begin = 0 # starts at 0.ged
    end = 60000  # exclusive upper bound, so the last file is 59999.ged
gedcomNamesList = [] # creation of a list with possible GEDCOM file names
while begin != end:
datename = str(begin) + ".ged" # name of GEDCOM file
gedcomNamesList.append(datename)
begin = begin + 1
# possibility that not all files of the gedcomNamesList exist
# do not let the non-existent files into the multiprocessing
# check the existence of the files
gedcomNamesListClear = [] # version of gedcomNamesList, which contains only existing files
for i in gedcomNamesList:
filepath = os.path.join("data", i) # GEDCOM files are located in the subfolder "data"
try:
gedcom = open(filepath, "r", encoding="utf-8")
gedcom.close()
gedcomNamesListClear.append(i)
except FileNotFoundError:
pass
# Loading data from the Mini-GOV
miniGovList = importMiniGOV()
# initialization of CSV files, which are needed in the further course
filenameStep1 = "quality.csv"
fieldnamesStep1 = ["filename", # name of GEDCOM file
"number of places",
"number of noHit",
"number of moreThanOneHit",
"number of definitely coordinates",
"longitude mean of of definitely coordinates",
"latitude mean of of definitely coordinates",
"number of existing clusters",
"number of relevant clusters",
"cluster midpoints" # list of lists of geographical centers of individual clusters
]
# load already existing data into a variable
qualityDict = createFile(filenameStep1, fieldnamesStep1, "\t", "utf-8")
    # list of all urbanonyms per source
filenameStep2 = "placefinder.csv"
fieldnamesStep2 = ["id", # GOV-ID of a place
"latitude", # latitude of the place
"longitude", # longitude of the place
"selection information", # description of the identification of this urbanonym
"adjusted name", # adjusted spelling of the urbanonym in the source
"original name", # original spelling of the urbanonym in the source
"filename" # name of the file where the urbanonym is found
]
createFile(filenameStep2, fieldnamesStep2, "\t", "utf-8")
# list of urban names already assigned to a province per file to avoid double searches
filenameStep3 = "provincesdict.csv"
fieldnamesStep3 = ["original name", # original spelling of the urbanonym in the source
"filename", # name of the file where the urbanonym is found
"id", # GOV-ID of a place
"province" # name of assigned administrative unit
]
createFile(filenameStep3, fieldnamesStep3, "\t", "utf-8")
# start of parallelization
    # executes the function "parallel" once per entry in the list gedcomNamesListClear (per source file)
# parallelization is realized to shorten the processing time
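    # Pool() starts one worker process per CPU core by default (os.cpu_count())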
pool = Pool()
    pool.map(partial(parallel,
                     miniGovList=miniGovList,
                     qualityDict=qualityDict,
                     fieldnamesStep1=fieldnamesStep1,
                     fieldnamesStep2=fieldnamesStep2,
                     fieldnamesStep3=fieldnamesStep3,
                     filenameStep1=filenameStep1,
                     filenameStep2=filenameStep2,
                     filenameStep3=filenameStep3),
             gedcomNamesListClear)
pool.close()
pool.join()
# memorizing the time of finishing
finishtime = time.perf_counter()
# print the duration of the program run
print("Finished in", round(finishtime - starttime, 2), "seconds(s)")