Commit 68cc5897 authored by Marcus Baumgarten

Upload new file

parent 43edfb81
from haversine import haversine
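# note: the haversine package (e.g. installed via "pip install haversine") returns
# great-circle distances in kilometers by default, which the clustering below relies on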
def prePlaceFinder(data, minigov, fileName):
"""
This function creates a list of all unique urban names within a source.
    In this form, the function works only with GEDCOM files as sources.
    If other file types are used as sources, an adjustment is necessary here.
:param data: content of one GEDCOM file
    :param minigov: list of merged entries of the Mini-GOV
    :param fileName: name of the GEDCOM file being processed
    :return: list of uniquely identifiable locations (based on the name without context)
"""
placelist = []
for gedcomRow in data:
if "PLAC" in gedcomRow:
# overwrite the row by deleting the tag information
# +5 because "PLAC" has four characters followed by a space
gedcomRow = gedcomRow[gedcomRow.find("PLAC") + 5:]
# searching in the Mini-GOV
minigovSearchResult = minigovSearch(minigov, gedcomRow)
rowInMiniGOV = minigovSearchResult[0]
# if there was a unique match, rowInMiniGOV is unequal -1
if rowInMiniGOV != -1:
govId = minigov[rowInMiniGOV]["GOV-Kennung"] # GOV id of the detected place
longitude = minigov[rowInMiniGOV]["geographische Länge"] # longitude
latitude = minigov[rowInMiniGOV]["geographische Breite"] # latitude
else: # with no clear hit
govId = "NONE"
longitude = "NONE"
latitude = "NONE"
# extend a list of places each with a dictionary
placeDict = {
"place": gedcomRow,
"govid": govId,
"longitude": longitude,
"latitude": latitude,
"filename": fileName,
"numberHits": minigovSearchResult[1] # can be "1", ">1" or "<1"
}
placelist.append(placeDict)
    return placelist
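
# Example (illustrative values only): a resolved place produces an entry such as
# {"place": "Berlin, Berlin, Deutschland", "govid": "object_123456",
#  "longitude": "13.4", "latitude": "52.5", "filename": "sample.ged", "numberHits": "1"};
# unresolved names carry "NONE" for govid and both coordinates, with numberHits ">1" or "0".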
def gedcomRowParser(data, counter):
"""
The function parses GEDCOM rows into their individual components.
:param data: content of a GEDCOM file
:param counter: number of GEDCOM row
:return: list of four elements (first character, content behind first char, tag, content behind tag)
"""
# save first character
firstChar = data[counter][:1]
# content after the first character excluding the first space
behindFirstChar = data[counter][2:]
# parsing of the tag
# if there is no further text behind the tag, then there is no space
try:
tag = behindFirstChar[:(behindFirstChar.index(" "))]
except ValueError:
tag = behindFirstChar
# content behind the tag
try:
behindTag = behindFirstChar[((behindFirstChar.index(" ")) + 1):]
except ValueError:
behindTag = ""
    return [firstChar, behindFirstChar, tag, behindTag]
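
# Example (illustrative): for the row "2 PLAC Berlin, Berlin, Deutschland" the parser
# returns ["2", "PLAC Berlin, Berlin, Deutschland", "PLAC", "Berlin, Berlin, Deutschland"].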
def minigovSearch(minigov, value):
"""
This function searches the Mini-GOV for location names.
:param minigov: list of merged entries of the Mini-GOV
:param value: name of the urbanonym
:return: List with two values (1. contains the line number in the Mini-GOV if the search result is unique, otherwise -1; 2. contains how many hits were found)
"""
# name of the column of the Mini-GOV to be searched
key = "aktueller Name"
# initial base cleanup of the place name
# cut off everything from the first comma
try:
valueCleaned = value[:value.index(",")]
except ValueError:
valueCleaned = value
# initialization of a list in which the line numbers of matching Mini-GOV entries are collected
hitsNumberList = []
# initialization of a list in which the urbanonyms of matching Mini-GOV entries are collected
hitsUrbanonymList = []
# Binary search algorithm for searching the Mini-GOV
# initial position is the center of the Mini-GOV
position = int(len(minigov) / 2)
    # position value of the previous iteration
    # initially the list length (not 0), so that the first step width abs(previousPosition - position) / 2 covers a quarter of the list
    previousPosition = len(minigov)
# search until the distance to the previous position is less than 10
while (previousPosition - position) not in range(-10, 10):
        previousPositionCache = position  # temporary storage, because position changes and the previous value previousPosition is still needed
if valueCleaned > minigov[position][key]: # alphabetical comparison
position = position + int(
abs(previousPosition - position) / 2) # amount of the difference between previousPosition and pos / 2
elif valueCleaned < minigov[position][key]: # alphabetical comparison
position = position - int(
abs(previousPosition - position) / 2) # amount of the difference between previousPosition and pos / 2
        elif valueCleaned == minigov[position][key]:  # alphabetical comparison, exact match
            break  # it cannot get any more precise than that
previousPosition = previousPositionCache
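
    # Example (illustrative): in a list of 1000 entries the search starts at position 500
    # with previousPosition 1000; the position then moves, e.g., 500 -> 750 -> 625 -> ...,
    # with the step width halving each iteration (250, 125, 62, ...) until it falls below 10.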
    # if a position was found, the 30 values above and below it are compared with valueCleaned
    # no place name occurs 60 times, therefore 30 is chosen
    # clamp the window to the list bounds: a negative index would not raise an IndexError
    # in Python but would silently wrap around to the end of the list
    start = max(0, position - 30)
    end = min(len(minigov), position + 30)
# compare from start to finish if the value from the Mini-GOV matches the name of the source
for i in range(start, end):
if minigov[i][key] == valueCleaned:
hitsNumberList.append(i)
hitsUrbanonymList.append(valueCleaned)
    # if exactly one value is found, pass the line number from the Mini-GOV and the information that there was only one hit
    if len(hitsNumberList) == 1:
        return [hitsNumberList[0], "1"]
    # with more than one hit, -1 signals that no unique hit was achieved
    elif len(hitsNumberList) > 1:
        return [-1, ">1"]
    # with no hit at all, -1 likewise signals that no unique hit was achieved
    else:
        return [-1, "0"]
def qualityChecker(content, placelist, previousQualityData, filename):
"""
This function is used to get qualitative parameters for each GEDCOM file.
This includes, for example, the information about the number of persons.
In this program, the determined data is also called metadata of the source.
:param content: contents of the GEDCOM file
:param placelist: list of uniquely identifiable locations in the source
:param previousQualityData: source metadata from previous processing runs
:param filename: name of GEDCOM file
:return: list of quality parameters or a string as termination condition if the file has already been processed
"""
# prepare qualitychecker()
    minOneFileIsMissing = 0  # binary variable for detecting missing files in the result CSV; if it is 1, at least one GEDCOM file is not in the CSV
# check if the file has been edited before
# if nothing is found, qualityIndex is None, otherwise the number of the row is contained
qualityIndex = next((index for (index, d) in enumerate(previousQualityData) if d["filename"] == filename), None)
if qualityIndex is not None:
# define a string as termination condition
# if the file has already been edited once, it should not be edited a second time
quality = "StartingExitStrategy"
        return quality
else: # file has not been edited yet
# initialising of variables
numberOfCoordinates = 0 # counter how many unique coordinates in file
numberOfPLAC = 0 # counts the number of PLAC tags
latitude = 0
longitude = 0
numberClusters = 1 # number of created clusters
        haversineDict = {  # for clustering, the first entry is always (0, 0)
            "coordinates": (0, 0),  # acceptable: (0, 0) lies in the ocean, so all real places are far away
"cluster": 0,
"filename": 0
}
clusterList = [haversineDict]
# call each line of the GEDCOM file in sequence
for i in range(len(content)):
# parse a GEDCOM line
resultParser = gedcomRowParser(content, i)
tag = resultParser[2]
behindTag = resultParser[3]
# cleanup of the content; removal of the content from the first comma
try:
behindTag = behindTag[:behindTag.index(",")]
            except ValueError:
                pass  # no comma present, keep the content unchanged
# if they are urbanonyms, calculate average coordinates
if tag[:4] == "PLAC":
numberOfPLAC = numberOfPLAC + 1
# formation of clusters of unique values
# compare with any unique location in placelist
# it is no problem to identify the placelist by the place name, because placelist has by definition only unique values (e.g. 2x Berlin does not work)
for placePlacelist in placelist:
# comparing
if behindTag == placePlacelist["place"] and placePlacelist["longitude"] != "NONE" and \
placePlacelist["longitude"] != "":
# add coordinates and a number variable
longitude = longitude + float(placePlacelist["longitude"]) # are still strings
latitude = latitude + float(placePlacelist["latitude"])
numberOfCoordinates = numberOfCoordinates + 1
# clustering of placePlacelist
                        clusterListCache = list(clusterList)  # shallow copy, so that appends to clusterList further below cannot affect the iteration
# list of possible clusters for a location
clusterAffiliation = [] # must be reset, because otherwise clusters are always found
# checking whether an existing cluster is less than 50 km away from a location
for singleCluster in clusterListCache:
                            # clusters already noted as candidates do not have to be examined again
                            if singleCluster["cluster"] not in clusterAffiliation:
coordPlace = (float(placePlacelist["latitude"]), float(placePlacelist["longitude"]))
coordMeanCluster = singleCluster["coordinates"]
# calculation of the distance in kilometers between location and possible other locations in clusters
distance = haversine(coordPlace, coordMeanCluster)
if distance <= 50: # in kilometers, at zero it is a place that already exists
# if the location is closer than 50 km to an existing cluster, the cluster is assigned to a list of possible clusters
# a location can belong to several clusters and thus connect them
clusterAffiliation.append(singleCluster["cluster"])
# with only one membership it will be added to the cluster
if len(clusterAffiliation) == 1:
haversineDict = {
"coordinates": (float(placePlacelist["latitude"]), float(placePlacelist["longitude"])),
"cluster": clusterAffiliation[0],
"filename": placePlacelist["filename"]
}
clusterList.append(haversineDict) # add to existing cluster
# more than one cluster is close under 50 km (clusters are merged)
elif len(clusterAffiliation) > 1:
                            # select the cluster name to keep; the lowest number is decisive
                            minCluster = min(clusterAffiliation)
                            for singleClusterAffiliation in clusterAffiliation:
                                # all other cluster entries that are not minCluster must be renamed to minCluster
                                if singleClusterAffiliation != minCluster:
                                    for singleClusterList in clusterList:
                                        if singleClusterList["cluster"] == singleClusterAffiliation:
                                            singleClusterList["cluster"] = minCluster  # change the value in the dict
                            # then create the new entry for the location
                            haversineDict = {
                                "coordinates": (float(placePlacelist["latitude"]), float(placePlacelist["longitude"])),
                                "cluster": minCluster,
"filename": placePlacelist["filename"]
}
clusterList.append(haversineDict)
# no affiliation to a cluster
# own cluster is created
elif len(clusterAffiliation) == 0:
haversineDict = {
"coordinates": (float(placePlacelist["latitude"]), float(placePlacelist["longitude"])),
"cluster": numberClusters,
"filename": placePlacelist["filename"]
}
clusterList.append(haversineDict)
numberClusters = numberClusters + 1 # count the total number of clusters
                        # a first hit excludes a second one, because placelist contains only unique values;
                        # coordinates occurring in several GEDCOM rows enter the calculation once per row, because this block runs for every PLAC line
                        break
# calculate average coordinates of whole source
        if numberOfCoordinates != 0:  # avoid division by zero
longitude = longitude / numberOfCoordinates
latitude = latitude / numberOfCoordinates
else:
longitude = "NONE"
latitude = "NONE"
# per GEDCOM file
# calculate number of different clusters
existingCluster = [] # list of assigned clusters
clusterMeanList = [] # list of averages of all clusters in a file for further processing
numberOfFinalCluster = 0
        # save only the cluster numbers from the clusterList
for singleClusterList in clusterList:
existingCluster.append(singleClusterList["cluster"])
# save only the coordinates from the clusterlist
# set a dot per file location
clusterLatLon = []
for singleClusterList in clusterList:
clusterLatLon.append(
[singleClusterList["coordinates"][0], singleClusterList["coordinates"][1], 500]) # regulate intensity
# calculation of the geographical center of the clusters in a file
# definition of a minimum size of locations in a cluster
minimumClusterSize = 6 # do not consider all clusters smaller or equal 5
# per cluster, basic idea: compare every location with every cluster
        # create numbers that can occur as cluster names
for possibleClusterNumber in range(len(clusterList)):
# initialization for the calculation of averages
lat = 0
long = 0
numberLatLong = 0
# add coordinates of matching clusters together
for singleClusterList in clusterList:
if singleClusterList["cluster"] == possibleClusterNumber:
lat = lat + singleClusterList["coordinates"][0]
long = long + singleClusterList["coordinates"][1]
numberLatLong = numberLatLong + 1
            # numberLatLong can be used to exclude small clusters; it must reach the minimum cluster size
            # this prevents a distorted divisor and averages for cluster numbers without entries
            if numberLatLong >= minimumClusterSize:
                lat = lat / numberLatLong  # mean latitude of the cluster
                long = long / numberLatLong  # mean longitude of the cluster
# the list is used for further calculations to determine/cluster locations
clusterMeanList.append([lat, long])
        # counting the remaining clusters (those with at least the minimum size)
        existingCluster = stringDuplicateCounter(existingCluster)
        for singleExistingCluster in existingCluster:
            if singleExistingCluster[1] >= minimumClusterSize:
                numberOfFinalCluster = numberOfFinalCluster + 1
# counting hits
noHit = 0
moreThanOneHit = 0
oneHit = 0
for singlePlacelist in placelist:
if singlePlacelist["numberHits"] == "1":
oneHit = oneHit + 1 # in contrast to numberOfCoordinates also those without coordinates
elif singlePlacelist["numberHits"] == "0":
noHit = noHit + 1
elif singlePlacelist["numberHits"] == ">1":
moreThanOneHit = moreThanOneHit + 1
        # generation of a dictionary for all metadata
quality = {
"filename": filename,
"longitude mean of of definitely coordinates": longitude,
"latitude mean of of definitely coordinates": latitude,
"number of places": numberOfPLAC,
"number of noHit": noHit,
"number of moreThanOneHit": moreThanOneHit,
"number of definitely coordinates": oneHit,
"number of existing clusters": (len(existingCluster) - 1), # minus 1, because cluster is initial 0
"number of relevant clusters": numberOfFinalCluster,
"cluster midpoints": clusterMeanList
}
        return quality
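
# Illustrative reading of the clustering above: locations chain into one cluster as long as
# each lies within 50 km of at least one location already in the cluster; with
# minimumClusterSize = 6 a cluster needs at least six locations to count as relevant.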
def stringDuplicateCounter(inputList):
"""
This function is used to count equal values (duplicates) in lists.
    :param inputList: list to be examined (one column)
:return: list of lists containing the name and number of each element in the list
"""
newList = []
attribute = []
# examine each element of "list"
for counter, i in enumerate(list): # i is an element of the list
# for each new element this step is performed
if i not in attribute:
# count the number of these elements in the list
doublingCounter = 0
            for y in inputList:
if i == y:
doublingCounter = doublingCounter + 1
newList.append([[i], doublingCounter])
attribute.append(i)
# alphabetical sorting
newList.sort()
    return newList
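
# Example (illustrative):
# stringDuplicateCounter([3, 1, 3, 2]) returns [[[1], 1], [[2], 1], [[3], 2]];
# each element is wrapped in its own list, followed by its number of occurrences.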
def mainMetadataInspector(line, filename, miniGovList, previousQualityData):
"""
This function first initializes the creation of a list of unique location information in a source.
Afterwards it is used to achieve a further analysis of metadata or qualitative features.
:param line: contents of the GEDCOM file
:param filename: name of the file/source
:param miniGovList: list of merged entries of the Mini-GOV
:param previousQualityData: source metadata from previous processing runs
:return: content for one line of the file "quality.csv"
"""
# creation of a list with unique places
placelist = prePlaceFinder(line, miniGovList, filename) # placelist is list of unique locations with coordinates
# metadata analysis, calculate some parameters
quality = qualityChecker(line, placelist, previousQualityData, filename)
    return quality
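
# Minimal usage sketch (illustrative; reading the GEDCOM file and the miniGovList
# variable shown here are assumptions, not part of this module):
#
# with open("sample.ged", "r", encoding="utf-8") as gedcomFile:
#     lines = [row.rstrip("\n") for row in gedcomFile]
# quality = mainMetadataInspector(lines, "sample.ged", miniGovList, previousQualityData=[])
# # quality is either the dictionary for one row of "quality.csv" or the string
# # "StartingExitStrategy" if the file was already processed in an earlier run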