From 68cc589789cd0936d9ad00cfb018ec585550d58d Mon Sep 17 00:00:00 2001
From: Marcus Baumgarten <baumgarten@hab.de>
Date: Wed, 28 Sep 2022 10:55:40 +0000
Subject: [PATCH] Upload new file

---
 qualitychecker.py | 384 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 384 insertions(+)
 create mode 100644 qualitychecker.py

diff --git a/qualitychecker.py b/qualitychecker.py
new file mode 100644
index 0000000..69a7725
--- /dev/null
+++ b/qualitychecker.py
@@ -0,0 +1,384 @@
+from haversine import haversine
+
+
+def prePlaceFinder(data, minigov, fileName):
+    """
+    This function creates a list of all unique urban names within a source.
+    It works only with GEDCOM files as sources (in this specification).
+    If other file types are used as sources, an adjustment is necessary here.
+    :param data: content of one GEDCOM file
+    :param minigov: list of merged entries of the Mini-GOV
+    :param fileName: name of the GEDCOM file
+    :return: list of uniquely identifiable locations (based on the name without context)
+    """
+    placelist = []
+    for gedcomRow in data:
+        if "PLAC" in gedcomRow:
+            # overwrite the row by deleting the tag information
+            # +5 because "PLAC" has four characters followed by a space
+            gedcomRow = gedcomRow[gedcomRow.find("PLAC") + 5:]
+            # search the Mini-GOV
+            minigovSearchResult = minigovSearch(minigov, gedcomRow)
+            rowInMiniGOV = minigovSearchResult[0]
+            # if there was a unique match, rowInMiniGOV is not -1
+            if rowInMiniGOV != -1:
+                govId = minigov[rowInMiniGOV]["GOV-Kennung"]  # GOV id of the detected place
+                longitude = minigov[rowInMiniGOV]["geographische Länge"]  # longitude
+                latitude = minigov[rowInMiniGOV]["geographische Breite"]  # latitude
+            else:  # no clear hit
+                govId = "NONE"
+                longitude = "NONE"
+                latitude = "NONE"
+            # extend the list of places with one dictionary per place
+            placeDict = {
+                "place": gedcomRow,
+                "govid": govId,
+                "longitude": longitude,
+                "latitude": latitude,
+                "filename": fileName,
+                "numberHits": minigovSearchResult[1]  # can be "1", ">1" or "0"
+            }
+            placelist.append(placeDict)
+    return placelist
+
+
+def gedcomRowParser(data, counter):
+    """
+    The function parses a GEDCOM row into its individual components.
+    :param data: content of a GEDCOM file
+    :param counter: number of the GEDCOM row
+    :return: list of four elements (first character, content behind the first character, tag, content behind the tag)
+    """
+    # save the first character (the GEDCOM level)
+    firstChar = data[counter][:1]
+
+    # content after the first character, excluding the first space
+    behindFirstChar = data[counter][2:]
+
+    # parse the tag
+    # if there is no further text behind the tag, there is no space either
+    try:
+        tag = behindFirstChar[:behindFirstChar.index(" ")]
+    except ValueError:
+        tag = behindFirstChar
+
+    # content behind the tag
+    try:
+        behindTag = behindFirstChar[behindFirstChar.index(" ") + 1:]
+    except ValueError:
+        behindTag = ""
+    return [firstChar, behindFirstChar, tag, behindTag]
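+
+
+# Illustrative sketch (not part of the original patch): for the GEDCOM row
+# "2 PLAC Berlin, Brandenburg", gedcomRowParser(["2 PLAC Berlin, Brandenburg"], 0)
+# returns ["2", "PLAC Berlin, Brandenburg", "PLAC", "Berlin, Brandenburg"],
+# i.e. the level, the text behind the level, the tag, and the text behind the tag.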
+
+
+def minigovSearch(minigov, value):
+    """
+    This function searches the Mini-GOV for a location name.
+    :param minigov: list of merged entries of the Mini-GOV
+    :param value: name of the urbanonym
+    :return: list with two values (1. the line number in the Mini-GOV if the search result is unique, otherwise -1; 2. the number of hits as a string)
+    """
+    # name of the Mini-GOV column to be searched
+    key = "aktueller Name"
+
+    # initial basic cleanup of the place name:
+    # cut off everything from the first comma onwards
+    try:
+        valueCleaned = value[:value.index(",")]
+    except ValueError:
+        valueCleaned = value
+
+    # list in which the line numbers of matching Mini-GOV entries are collected
+    hitsNumberList = []
+    # list in which the urbanonyms of matching Mini-GOV entries are collected
+    hitsUrbanonymList = []
+
+    # binary search algorithm for searching the Mini-GOV
+    # the initial position is the center of the Mini-GOV
+    position = int(len(minigov) / 2)
+    # position value of the previous iteration
+    # initially len(minigov) and not 0, so that the first step size (half the absolute difference) is large
+    previousPosition = len(minigov)
+    # search until the distance to the previous position is less than 10
+    while (previousPosition - position) not in range(-10, 10):
+        previousPositionCache = position  # temporary storage, because position changes and the previous value previousPosition is still needed
+        if valueCleaned > minigov[position][key]:  # alphabetical comparison
+            position = position + int(abs(previousPosition - position) / 2)  # half the absolute difference between previousPosition and position
+        elif valueCleaned < minigov[position][key]:  # alphabetical comparison
+            position = position - int(abs(previousPosition - position) / 2)  # half the absolute difference between previousPosition and position
+        elif valueCleaned == minigov[position][key]:  # exact match
+            break  # it cannot get any more precise than that
+        previousPosition = previousPositionCache
+    # once a position has been found, the 30 values above and below it are compared with valueCleaned
+    # no place name occurs 60 times, therefore 30 is chosen
+    # clamp explicitly: negative indices would wrap around in Python instead of raising IndexError
+    start = max(position - 30, 0)
+    end = min(position + 30, len(minigov))
+    # compare from start to end whether the value from the Mini-GOV matches the name from the source
+    for i in range(start, end):
+        if minigov[i][key] == valueCleaned:
+            hitsNumberList.append(i)
+            hitsUrbanonymList.append(valueCleaned)
+
+    # if exactly one value is found, pass its line number in the Mini-GOV and the information that there was only one hit
+    if len(hitsNumberList) == 1:
+        return [hitsNumberList[0], "1"]
+    # with more than one hit, -1 makes clear that no unique hit was achieved
+    elif len(hitsNumberList) > 1:
+        return [-1, ">1"]
+    # with less than one hit, -1 also indicates that no unique hit was achieved
+    else:
+        return [-1, "0"]
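+
+
+# Hedged usage sketch (not part of the original patch; the index 4711 is invented):
+# assuming minigov is sorted by "aktueller Name", minigovSearch(minigov,
+# "Berlin, Brandenburg") truncates the name at the first comma and returns
+# e.g. [4711, "1"] for a unique hit, [-1, ">1"] for an ambiguous name,
+# or [-1, "0"] if there is no hit at all.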
+
+
+def qualityChecker(content, placelist, previousQualityData, filename):
+    """
+    This function determines qualitative parameters for each GEDCOM file,
+    for example the number of detected place names.
+    In this program, the determined data is also called the metadata of the source.
+    :param content: contents of the GEDCOM file
+    :param placelist: list of uniquely identifiable locations in the source
+    :param previousQualityData: source metadata from previous processing runs
+    :param filename: name of the GEDCOM file
+    :return: dictionary of quality parameters, or a string as termination condition if the file has already been processed
+    """
+    # prepare qualityChecker()
+    minOneFileIsMissing = 0  # binary variable for the detection of missing files in the result CSV; if it is 1, at least one GEDCOM file is not in the CSV
+
+    # check whether the file has been processed before
+    # if nothing is found, qualityIndex is None, otherwise it contains the number of the matching row
+    qualityIndex = next((index for (index, d) in enumerate(previousQualityData) if d["filename"] == filename), None)
+    if qualityIndex is not None:
+        # define a string as termination condition
+        # if the file has already been processed once, it should not be processed a second time
+        quality = "StartingExitStrategy"
+        return quality
+    else:  # the file has not been processed yet
+        # initialization of variables
+        numberOfCoordinates = 0  # counts how many unique coordinates are in the file
+        numberOfPLAC = 0  # counts the number of PLAC tags
+        latitude = 0
+        longitude = 0
+        numberClusters = 1  # number of created clusters
+        haversineDict = {  # for clustering, initially always (0, 0) as the first value
+            "coordinates": (0, 0),  # acceptable, because this point lies in the sea and all places are far away from it
+            "cluster": 0,
+            "filename": 0
+        }
+        clusterList = [haversineDict]
+
+        # process each line of the GEDCOM file in sequence
+        for i in range(len(content)):
+            # parse a GEDCOM line
+            resultParser = gedcomRowParser(content, i)
+            tag = resultParser[2]
+            behindTag = resultParser[3]
+
+            # cleanup of the content; remove everything from the first comma onwards
+            try:
+                behindTag = behindTag[:behindTag.index(",")]
+            except ValueError:
+                pass  # no comma present, keep the value unchanged
+
+            # if the row contains an urbanonym, calculate average coordinates
+            if tag[:4] == "PLAC":
+                numberOfPLAC = numberOfPLAC + 1
+                # formation of clusters of unique values
+                # compare with every unique location in placelist
+                # identifying entries of placelist by their place name is unproblematic, because
+                # placelist by definition only contains unique values (e.g. Berlin twice does not work)
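+                # Hedged aside (not part of the original patch): haversine() from the
+                # "haversine" package takes two (latitude, longitude) tuples and by
+                # default returns the great-circle distance in kilometers, e.g.
+                # haversine((52.52, 13.38), (52.40, 13.07)) is roughly 25 km, so two
+                # such places fall into one cluster under the 50 km threshold below.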
+                for placePlacelist in placelist:
+                    # comparison
+                    if behindTag == placePlacelist["place"] and placePlacelist["longitude"] != "NONE" and \
+                            placePlacelist["longitude"] != "":
+                        # add up the coordinates and a counter variable
+                        longitude = longitude + float(placePlacelist["longitude"])  # the values are still strings
+                        latitude = latitude + float(placePlacelist["latitude"])
+                        numberOfCoordinates = numberOfCoordinates + 1
+
+                        # clustering of placePlacelist
+                        clusterListCache = list(clusterList)  # iterate over a copy, otherwise clusterList would grow inside its own loop
+                        # list of possible clusters for this location
+                        clusterAffiliation = []  # must be reset, because otherwise clusters are always found
+                        # check whether an existing cluster is less than 50 km away from the location
+                        for singleCluster in clusterListCache:
+                            if singleCluster["cluster"] not in clusterAffiliation:  # no need to examine further members of an already matched cluster
+                                coordPlace = (float(placePlacelist["latitude"]), float(placePlacelist["longitude"]))
+                                coordMeanCluster = singleCluster["coordinates"]
+                                # calculate the distance in kilometers between the location and the locations already assigned to clusters
+                                distance = haversine(coordPlace, coordMeanCluster)
+                                if distance <= 50:  # in kilometers; zero means the place already exists
+                                    # if the location is closer than 50 km to an existing cluster, that cluster is added to the list of possible clusters
+                                    # a location can belong to several clusters and thereby connect them
+                                    clusterAffiliation.append(singleCluster["cluster"])
+
+                        # with exactly one membership the location is added to that cluster
+                        if len(clusterAffiliation) == 1:
+                            haversineDict = {
+                                "coordinates": (float(placePlacelist["latitude"]), float(placePlacelist["longitude"])),
+                                "cluster": clusterAffiliation[0],
+                                "filename": placePlacelist["filename"]
+                            }
+                            clusterList.append(haversineDict)  # add to the existing cluster
+                        # more than one cluster is closer than 50 km (the clusters are merged)
+                        elif len(clusterAffiliation) > 1:
+                            # select the cluster name to be kept; the lowest one is decisive
+                            lowestCluster = min(clusterAffiliation)
+                            # all other cluster entries that are not lowestCluster must be renamed to lowestCluster
+                            for singleClusterAffiliation in clusterAffiliation:
+                                if singleClusterAffiliation != lowestCluster:
+                                    for singleClusterList in clusterList:
+                                        if singleClusterList["cluster"] == singleClusterAffiliation:
+                                            singleClusterList["cluster"] = lowestCluster  # change the value in the dict
+                            # then create the new entry for the location
+                            haversineDict = {
+                                "coordinates": (float(placePlacelist["latitude"]), float(placePlacelist["longitude"])),
+                                "cluster": lowestCluster,
+                                "filename": placePlacelist["filename"]
+                            }
+                            clusterList.append(haversineDict)
+                        # no affiliation to any cluster
+                        # a new cluster is created
+                        elif len(clusterAffiliation) == 0:
+                            haversineDict = {
+                                "coordinates": (float(placePlacelist["latitude"]), float(placePlacelist["longitude"])),
+                                "cluster": numberClusters,
+                                "filename": placePlacelist["filename"]
+                            }
+                            clusterList.append(haversineDict)
+                            numberClusters = numberClusters + 1  # count the total number of clusters
+
+                        # after one hit there can be no second one, because placelist only contains unique values;
+                        # coordinates that occur several times are included several times in the calculation, because this whole block runs once per PLAC row
+                        break
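+
+        # Worked example of the merge rule above (illustrative, not part of the
+        # original patch): if a place lies within 50 km of members of both cluster 2
+        # and cluster 5, all entries of cluster 5 are relabelled to 2 and the place
+        # itself joins cluster 2, so the two clusters become one.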
+
+        # calculate the average coordinates of the whole source
+        if numberOfCoordinates != 0:  # avoid division by zero
+            longitude = longitude / numberOfCoordinates
+            latitude = latitude / numberOfCoordinates
+        else:
+            longitude = "NONE"
+            latitude = "NONE"
+
+        # per GEDCOM file
+        # calculate the number of different clusters
+        existingCluster = []  # list of assigned clusters
+        clusterMeanList = []  # list of the averages of all clusters in a file, for further processing
+        numberOfFinalCluster = 0
+        # save only the cluster numbers from clusterList
+        for singleClusterList in clusterList:
+            existingCluster.append(singleClusterList["cluster"])
+
+        # save only the coordinates from clusterList
+        # set one dot per location in the file
+        clusterLatLon = []
+        for singleClusterList in clusterList:
+            clusterLatLon.append(
+                [singleClusterList["coordinates"][0], singleClusterList["coordinates"][1], 500])  # the 500 regulates the intensity
+
+        # calculation of the geographical centers of the clusters in a file
+        # definition of a minimum number of locations in a cluster
+        minimumClusterSize = 6  # do not consider clusters with 5 or fewer locations
+        # per cluster; basic idea: compare every location with every cluster
+        # generate numbers that can be used as names of the clusters
+        for possibleClusterNumber in range(len(clusterList)):
+            # initialization for the calculation of the averages
+            lat = 0
+            long = 0
+            numberLatLong = 0
+            # add up the coordinates of matching clusters
+            for singleClusterList in clusterList:
+                if singleClusterList["cluster"] == possibleClusterNumber:
+                    lat = lat + singleClusterList["coordinates"][0]
+                    long = long + singleClusterList["coordinates"][1]
+                    numberLatLong = numberLatLong + 1
+            # numberLatLong is used to exclude small clusters
+            # only for clusters that really exist, therefore at least 1
+            if numberLatLong >= minimumClusterSize:  # this check must come here, because otherwise the divisor would be distorted and clusters would be created where no cluster entry exists (e.g. 23)
+                lat = lat / numberLatLong  # numberLatLong is at least 1, so no division by zero
+                long = long / numberLatLong
+                # this list is used in further calculations to determine/cluster locations
+                clusterMeanList.append([lat, long])
+
+        # count the remaining clusters (clusters with the minimum size)
+        existingCluster = stringDuplicateCounter(existingCluster)
+        for singleExistingCluster in existingCluster:
+            if singleExistingCluster[1] >= minimumClusterSize:
+                numberOfFinalCluster = numberOfFinalCluster + 1
+
+        # count the hits
+        noHit = 0
+        moreThanOneHit = 0
+        oneHit = 0
+        for singlePlacelist in placelist:
+            if singlePlacelist["numberHits"] == "1":
+                oneHit = oneHit + 1  # in contrast to numberOfCoordinates, this also counts places without coordinates
+            elif singlePlacelist["numberHits"] == "0":
+                noHit = noHit + 1
+            elif singlePlacelist["numberHits"] == ">1":
+                moreThanOneHit = moreThanOneHit + 1
+
+        # generation of a dictionary containing all the metadata
+        quality = {
+            "filename": filename,
+            "longitude mean of definitely coordinates": longitude,
+            "latitude mean of definitely coordinates": latitude,
+            "number of places": numberOfPLAC,
+            "number of noHit": noHit,
+            "number of moreThanOneHit": moreThanOneHit,
+            "number of definitely coordinates": oneHit,
+            "number of existing clusters": (len(existingCluster) - 1),  # minus 1, because cluster 0 is the initial dummy
+            "number of relevant clusters": numberOfFinalCluster,
+            "cluster midpoints": clusterMeanList
+        }
+        return quality
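+
+
+# Hedged sketch for the helper below (not part of the original patch):
+# stringDuplicateCounter(["a", "b", "a"]) returns [[["a"], 2], [["b"], 1]],
+# i.e. each distinct element wrapped in a list together with its count,
+# sorted alphabetically.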
+
+
+def stringDuplicateCounter(inputList):
+    """
+    This function counts equal values (duplicates) in lists.
+    :param inputList: the list to be examined (one column)
+    :return: list of lists containing the name and the number of occurrences of each element of the list
+    """
+    newList = []
+    attribute = []
+    # examine each element of inputList
+    for i in inputList:  # i is an element of the list
+        # this step is performed once for each new (not yet seen) element
+        if i not in attribute:
+            # count the number of occurrences of this element in the list
+            doublingCounter = 0
+            for y in inputList:
+                if i == y:
+                    doublingCounter = doublingCounter + 1
+            newList.append([[i], doublingCounter])
+            attribute.append(i)
+    # alphabetical sorting
+    newList.sort()
+    return newList
+
+
+def mainMetadataInspector(line, filename, miniGovList, previousQualityData):
+    """
+    This function first initializes the creation of a list of unique location information in a source.
+    Afterwards it is used to carry out a further analysis of metadata and qualitative features.
+    :param line: contents of the GEDCOM file
+    :param filename: name of the file/source
+    :param miniGovList: list of merged entries of the Mini-GOV
+    :param previousQualityData: source metadata from previous processing runs
+    :return: content for one line of the file "quality.csv"
+    """
+    # creation of a list of unique places
+    placelist = prePlaceFinder(line, miniGovList, filename)  # placelist is a list of unique locations with coordinates
+    # metadata analysis, calculate some parameters
+    quality = qualityChecker(line, placelist, previousQualityData, filename)
+    return quality
--
GitLab
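
Usage sketch (illustrative, not part of the patch itself): assuming the Mini-GOV
entries have already been loaded and sorted by "aktueller Name" into miniGovList,
one row of "quality.csv" could be produced like this, where "source.ged" stands
in for any GEDCOM file:

    with open("source.ged", "r", encoding="utf-8") as gedcom:
        rows = gedcom.read().splitlines()
    qualityRow = mainMetadataInspector(rows, "source.ged", miniGovList, [])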