Commit 68cc5897 authored by Marcus Baumgarten

Upload new file

parent 43edfb81
from haversine import haversine
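# note: the haversine package (e.g. installed via "pip install haversine") returns
# great-circle distances in kilometers by default, which the clustering below relies on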
def prePlaceFinder(data, minigov, fileName):
"""
This function creates a list of all unique urban names within a source.
    In this form, the function works only with GEDCOM files as sources.
    If other file types are used as sources, an adjustment is necessary here.
:param data: content of one GEDCOM file
    :param minigov: list of merged entries of the Mini-GOV
    :param fileName: name of the GEDCOM file being processed
    :return: list of uniquely identifiable locations (based on the name without context)
"""
placelist = []
for gedcomRow in data:
if "PLAC" in gedcomRow:
# overwrite the row by deleting the tag information
# +5 because "PLAC" has four characters followed by a space
gedcomRow = gedcomRow[gedcomRow.find("PLAC") + 5:]
# searching in the Mini-GOV
minigovSearchResult = minigovSearch(minigov, gedcomRow)
rowInMiniGOV = minigovSearchResult[0]
# if there was a unique match, rowInMiniGOV is unequal -1
if rowInMiniGOV != -1:
govId = minigov[rowInMiniGOV]["GOV-Kennung"] # GOV id of the detected place
longitude = minigov[rowInMiniGOV]["geographische Länge"] # longitude
latitude = minigov[rowInMiniGOV]["geographische Breite"] # latitude
else: # with no clear hit
govId = "NONE"
longitude = "NONE"
latitude = "NONE"
# extend a list of places each with a dictionary
placeDict = {
"place": gedcomRow,
"govid": govId,
"longitude": longitude,
"latitude": latitude,
"filename": fileName,
"numberHits": minigovSearchResult[1] # can be "1", ">1" or "<1"
}
placelist.append(placeDict)
    return placelist
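
# Example (illustrative values only): a resolved place produces an entry such as
# {"place": "Berlin, Berlin, Deutschland", "govid": "object_123456",
#  "longitude": "13.4", "latitude": "52.5", "filename": "sample.ged", "numberHits": "1"};
# unresolved names carry "NONE" for govid and both coordinates, with numberHits ">1" or "0".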
def gedcomRowParser(data, counter):
"""
The function parses GEDCOM rows into their individual components.
:param data: content of a GEDCOM file
:param counter: number of GEDCOM row
:return: list of four elements (first character, content behind first char, tag, content behind tag)
"""
# save first character
firstChar = data[counter][:1]
# content after the first character excluding the first space
behindFirstChar = data[counter][2:]
# parsing of the tag
# if there is no further text behind the tag, then there is no space
try:
tag = behindFirstChar[:(behindFirstChar.index(" "))]
except ValueError:
tag = behindFirstChar
# content behind the tag
try:
behindTag = behindFirstChar[((behindFirstChar.index(" ")) + 1):]
except ValueError:
behindTag = ""
    return [firstChar, behindFirstChar, tag, behindTag]
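
# Example (illustrative): for the row "2 PLAC Berlin, Berlin, Deutschland" the parser
# returns ["2", "PLAC Berlin, Berlin, Deutschland", "PLAC", "Berlin, Berlin, Deutschland"].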
def minigovSearch(minigov, value):
"""
This function searches the Mini-GOV for location names.
:param minigov: list of merged entries of the Mini-GOV
:param value: name of the urbanonym
:return: List with two values (1. contains the line number in the Mini-GOV if the search result is unique, otherwise -1; 2. contains how many hits were found)
"""
# name of the column of the Mini-GOV to be searched
key = "aktueller Name"
# initial base cleanup of the place name
# cut off everything from the first comma
try:
valueCleaned = value[:value.index(",")]
except ValueError:
valueCleaned = value
# initialization of a list in which the line numbers of matching Mini-GOV entries are collected
hitsNumberList = []
# initialization of a list in which the urbanonyms of matching Mini-GOV entries are collected
hitsUrbanonymList = []
# Binary search algorithm for searching the Mini-GOV
# initial position is the center of the Mini-GOV
position = int(len(minigov) / 2)
    # position value of the previous iteration
    # initially the list length (not 0), so that the first step width abs(previousPosition - position) / 2 covers a quarter of the list
    previousPosition = len(minigov)
# search until the distance to the previous position is less than 10
while (previousPosition - position) not in range(-10, 10):
        previousPositionCache = position  # temporary storage, because position changes and the previous value previousPosition is still needed
if valueCleaned > minigov[position][key]: # alphabetical comparison
position = position + int(
abs(previousPosition - position) / 2) # amount of the difference between previousPosition and pos / 2
elif valueCleaned < minigov[position][key]: # alphabetical comparison
position = position - int(
abs(previousPosition - position) / 2) # amount of the difference between previousPosition and pos / 2
        elif valueCleaned == minigov[position][key]:  # alphabetical comparison, exact match
            break  # it cannot get any more precise than that
previousPosition = previousPositionCache
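
    # Example (illustrative): in a list of 1000 entries the search starts at position 500
    # with previousPosition 1000; the position then moves, e.g., 500 -> 750 -> 625 -> ...,
    # with the step width halving each iteration (250, 125, 62, ...) until it falls below 10.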
    # if a position was found, the 30 values above and below it are compared with valueCleaned
    # no place name occurs 60 times, therefore 30 is chosen
    # clamp the window to the list bounds: a negative index would not raise an IndexError
    # in Python but would silently wrap around to the end of the list
    start = max(0, position - 30)
    end = min(len(minigov), position + 30)
# compare from start to finish if the value from the Mini-GOV matches the name of the source
for i in range(start, end):
if minigov[i][key] == valueCleaned:
hitsNumberList.append(i)
hitsUrbanonymList.append(valueCleaned)
    # if exactly one value is found, pass the line number from the Mini-GOV and the information that there was only one hit
    if len(hitsNumberList) == 1:
        return [hitsNumberList[0], "1"]
    # with more than one hit, -1 signals that no unique hit was achieved
    elif len(hitsNumberList) > 1:
        return [-1, ">1"]
    # with no hit at all, -1 likewise signals that no unique hit was achieved
    else:
        return [-1, "0"]
def qualityChecker(content, placelist, previousQualityData, filename):
"""
This function is used to get qualitative parameters for each GEDCOM file.
This includes, for example, the information about the number of persons.
In this program, the determined data is also called metadata of the source.
:param content: contents of the GEDCOM file
:param placelist: list of uniquely identifiable locations in the source
:param previousQualityData: source metadata from previous processing runs
:param filename: name of GEDCOM file
:return: list of quality parameters or a string as termination condition if the file has already been processed
"""
# prepare qualitychecker()
    minOneFileIsMissing = 0  # binary variable for detecting missing files in the result CSV; if it is 1, at least one GEDCOM file is not in the CSV
# check if the file has been edited before
# if nothing is found, qualityIndex is None, otherwise the number of the row is contained
qualityIndex = next((index for (index, d) in enumerate(previousQualityData) if d["filename"] == filename), None)
if qualityIndex is not None:
# define a string as termination condition
# if the file has already been edited once, it should not be edited a second time
quality = "StartingExitStrategy"
        return quality
else: # file has not been edited yet
# initialising of variables
numberOfCoordinates = 0 # counter how many unique coordinates in file
numberOfPLAC = 0 # counts the number of PLAC tags
latitude = 0
longitude = 0
numberClusters = 1 # number of created clusters
        haversineDict = {  # for clustering, the first entry is always (0, 0)
            "coordinates": (0, 0),  # acceptable: (0, 0) lies in the ocean, so all real places are far away
"cluster": 0,
"filename": 0
}
clusterList = [haversineDict]
# call each line of the GEDCOM file in sequence
for i in range(len(content)):
# parse a GEDCOM line
resultParser = gedcomRowParser(content, i)
tag = resultParser[2]
behindTag = resultParser[3]
# cleanup of the content; removal of the content from the first comma
try:
behindTag = behindTag[:behindTag.index(",")]
            except ValueError:
                pass  # no comma present, keep the content unchanged
# if they are urbanonyms, calculate average coordinates
if tag[:4] == "PLAC":
numberOfPLAC = numberOfPLAC + 1
# formation of clusters of unique values
# compare with any unique location in placelist
# it is no problem to identify the placelist by the place name, because placelist has by definition only unique values (e.g. 2x Berlin does not work)
for placePlacelist in placelist:
# comparing
if behindTag == placePlacelist["place"] and placePlacelist["longitude"] != "NONE" and \
placePlacelist["longitude"] != "":
# add coordinates and a number variable
longitude = longitude + float(placePlacelist["longitude"]) # are still strings
latitude = latitude + float(placePlacelist["latitude"])
numberOfCoordinates = numberOfCoordinates + 1
# clustering of placePlacelist
                        clusterListCache = list(clusterList)  # shallow copy, so that appends to clusterList further below cannot affect the iteration
# list of possible clusters for a location
clusterAffiliation = [] # must be reset, because otherwise clusters are always found
# checking whether an existing cluster is less than 50 km away from a location
for singleCluster in clusterListCache:
                            # clusters already noted as candidates do not have to be examined again
                            if singleCluster["cluster"] not in clusterAffiliation:
coordPlace = (float(placePlacelist["latitude"]), float(placePlacelist["longitude"]))
coordMeanCluster = singleCluster["coordinates"]
# calculation of the distance in kilometers between location and possible other locations in clusters
distance = haversine(coordPlace, coordMeanCluster)
if distance <= 50: # in kilometers, at zero it is a place that already exists
# if the location is closer than 50 km to an existing cluster, the cluster is assigned to a list of possible clusters
# a location can belong to several clusters and thus connect them
clusterAffiliation.append(singleCluster["cluster"])
# with only one membership it will be added to the cluster
if len(clusterAffiliation) == 1:
haversineDict = {
"coordinates": (float(placePlacelist["latitude"]), float(placePlacelist["longitude"])),
"cluster": clusterAffiliation[0],
"filename": placePlacelist["filename"]
}
clusterList.append(haversineDict) # add to existing cluster
# more than one cluster is close under 50 km (clusters are merged)
elif len(clusterAffiliation) > 1:
                            # select the cluster name to keep; the lowest number is decisive
                            minCluster = min(clusterAffiliation)
                            for singleClusterAffiliation in clusterAffiliation:
                                # all other cluster entries that are not minCluster must be renamed to minCluster
                                if singleClusterAffiliation != minCluster:
                                    for singleClusterList in clusterList:
                                        if singleClusterList["cluster"] == singleClusterAffiliation:
                                            singleClusterList["cluster"] = minCluster  # change the value in the dict
                            # then create the new entry for the location
                            haversineDict = {
                                "coordinates": (float(placePlacelist["latitude"]), float(placePlacelist["longitude"])),
                                "cluster": minCluster,
"filename": placePlacelist["filename"]
}
clusterList.append(haversineDict)
# no affiliation to a cluster
# own cluster is created
elif len(clusterAffiliation) == 0:
haversineDict = {
"coordinates": (float(placePlacelist["latitude"]), float(placePlacelist["longitude"])),
"cluster": numberClusters,
"filename": placePlacelist["filename"]
}
clusterList.append(haversineDict)
numberClusters = numberClusters + 1 # count the total number of clusters
                        # a first hit excludes a second one, because placelist contains only unique values;
                        # coordinates occurring in several GEDCOM rows enter the calculation once per row, because this block runs for every PLAC line
                        break
# calculate average coordinates of whole source
        if numberOfCoordinates != 0:  # avoid division by zero
longitude = longitude / numberOfCoordinates
latitude = latitude / numberOfCoordinates
else:
longitude = "NONE"
latitude = "NONE"
# per GEDCOM file
# calculate number of different clusters
existingCluster = [] # list of assigned clusters
clusterMeanList = [] # list of averages of all clusters in a file for further processing
numberOfFinalCluster = 0
        # save only the cluster numbers from the clusterList
for singleClusterList in clusterList:
existingCluster.append(singleClusterList["cluster"])
# save only the coordinates from the clusterlist
# set a dot per file location
clusterLatLon = []
for singleClusterList in clusterList:
clusterLatLon.append(
[singleClusterList["coordinates"][0], singleClusterList["coordinates"][1], 500]) # regulate intensity
# calculation of the geographical center of the clusters in a file
# definition of a minimum size of locations in a cluster
minimumClusterSize = 6 # do not consider all clusters smaller or equal 5
# per cluster, basic idea: compare every location with every cluster
        # create numbers that can occur as cluster names
for possibleClusterNumber in range(len(clusterList)):
# initialization for the calculation of averages
lat = 0
long = 0
numberLatLong = 0
# add coordinates of matching clusters together
for singleClusterList in clusterList:
if singleClusterList["cluster"] == possibleClusterNumber:
lat = lat + singleClusterList["coordinates"][0]
long = long + singleClusterList["coordinates"][1]
numberLatLong = numberLatLong + 1
            # numberLatLong can be used to exclude small clusters; it must reach the minimum cluster size
            # this prevents a distorted divisor and averages for cluster numbers without entries
            if numberLatLong >= minimumClusterSize:
                lat = lat / numberLatLong  # mean latitude of the cluster
                long = long / numberLatLong  # mean longitude of the cluster
# the list is used for further calculations to determine/cluster locations
clusterMeanList.append([lat, long])
        # counting the remaining clusters (those with at least the minimum size)
        existingCluster = stringDuplicateCounter(existingCluster)
        for singleExistingCluster in existingCluster:
            if singleExistingCluster[1] >= minimumClusterSize:
                numberOfFinalCluster = numberOfFinalCluster + 1
# counting hits
noHit = 0
moreThanOneHit = 0
oneHit = 0
for singlePlacelist in placelist:
if singlePlacelist["numberHits"] == "1":
oneHit = oneHit + 1 # in contrast to numberOfCoordinates also those without coordinates
elif singlePlacelist["numberHits"] == "0":
noHit = noHit + 1
elif singlePlacelist["numberHits"] == ">1":
moreThanOneHit = moreThanOneHit + 1
        # generation of a dictionary for all metadata
quality = {
"filename": filename,
"longitude mean of of definitely coordinates": longitude,
"latitude mean of of definitely coordinates": latitude,
"number of places": numberOfPLAC,
"number of noHit": noHit,
"number of moreThanOneHit": moreThanOneHit,
"number of definitely coordinates": oneHit,
"number of existing clusters": (len(existingCluster) - 1), # minus 1, because cluster is initial 0
"number of relevant clusters": numberOfFinalCluster,
"cluster midpoints": clusterMeanList
}
        return quality
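
# Illustrative reading of the clustering above: locations chain into one cluster as long as
# each lies within 50 km of at least one location already in the cluster; with
# minimumClusterSize = 6 a cluster needs at least six locations to count as relevant.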
def stringDuplicateCounter(inputList):
"""
This function is used to count equal values (duplicates) in lists.
    :param inputList: list to be examined (one column)
:return: list of lists containing the name and number of each element in the list
"""
newList = []
attribute = []
# examine each element of "list"
for counter, i in enumerate(list): # i is an element of the list
# for each new element this step is performed
if i not in attribute:
# count the number of these elements in the list
doublingCounter = 0
            for y in inputList:
if i == y:
doublingCounter = doublingCounter + 1
newList.append([[i], doublingCounter])
attribute.append(i)
# alphabetical sorting
newList.sort()
    return newList
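
# Example (illustrative):
# stringDuplicateCounter([3, 1, 3, 2]) returns [[[1], 1], [[2], 1], [[3], 2]];
# each element is wrapped in its own list, followed by its number of occurrences.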
def mainMetadataInspector(line, filename, miniGovList, previousQualityData):
"""
This function first initializes the creation of a list of unique location information in a source.
Afterwards it is used to achieve a further analysis of metadata or qualitative features.
:param line: contents of the GEDCOM file
:param filename: name of the file/source
:param miniGovList: list of merged entries of the Mini-GOV
:param previousQualityData: source metadata from previous processing runs
:return: content for one line of the file "quality.csv"
"""
# creation of a list with unique places
placelist = prePlaceFinder(line, miniGovList, filename) # placelist is list of unique locations with coordinates
# metadata analysis, calculate some parameters
quality = qualityChecker(line, placelist, previousQualityData, filename)
    return quality
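
# Minimal usage sketch (illustrative; reading the GEDCOM file and the miniGovList
# variable shown here are assumptions, not part of this module):
#
# with open("sample.ged", "r", encoding="utf-8") as gedcomFile:
#     lines = [row.rstrip("\n") for row in gedcomFile]
# quality = mainMetadataInspector(lines, "sample.ged", miniGovList, previousQualityData=[])
# # quality is either the dictionary for one row of "quality.csv" or the string
# # "StartingExitStrategy" if the file was already processed in an earlier run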