filter_data.py¶

Bases: object

Returns:

Type	Description
`FilterData`	An instance of this object

Source code in osm_fieldwork/filter_data.py

def __init__(
    self,
    filespec: str = None,
    config: QueryConfig = None,
):
    """Set up an empty filter and optionally load an XLSForm right away.

    Args:
        filespec (str): The optional data file to read.
        config (QueryConfig): The optional query config. The file is only
            parsed when both it and filespec are supplied.

    Returns:
        (FilterData): An instance of this object
    """
    # Choice values collected by parse(), keyed by list name.
    self.tags = {}
    self.qc = config
    # Nothing to load unless both the file and its config were given.
    if not (filespec and config):
        return
    self.parse(filespec, config)

parse ¶

parse(filespec, config)

Read in the XLSForm and extract the data we want.

Parameters:

Name	Type	Description	Default
`filespec`	`str`	The filespec to the XLSForm file	required
`config`	`QueryConfig`	The query configuration to use	required

Returns:

Name	Type	Description
`title`	`str`	The title from the XLSForm Setting sheet
`extract`	`str`	The data extract filename from the XLSForm Survey sheet

Source code in osm_fieldwork/filter_data.py

def parse(
    self,
    filespec: str,
    config: QueryConfig,
):
    """Read in the XLSForm and extract the data we want.

    Besides returning the title and data extract name, this fills in
    ``self.tags`` (values grouped by ``list_name`` from the second sheet)
    and ``self.keep`` (the default columns plus any listed under the
    ``keep`` entry of the query config).

    Args:
        filespec (str): The filespec to the XLSForm file
        config (QueryConfig): The query config. When falsy, the config
            already stored on this object is used instead.

    Returns:
        title (str): The title from the XLSForm Setting sheet
        extract (str): The data extract filename from the XLSForm Survey
            sheet, "none" if the sheet has no ``select_one_from_file`` row,
            or "" if the ``type`` column is empty
    """
    if config:
        self.qc = config
    # Read the workbook once; a list of sheet positions yields a dict of
    # DataFrames keyed by those positions.
    entries = pd.read_excel(filespec, sheet_name=[0, 1, 2])
    title = entries[2]["form_title"].to_list()[0]

    extract = ""
    for entry in entries[0]["type"]:
        if str(entry) == "nan":
            continue
        entry = str(entry)
        if entry.startswith("select_one_from_file"):
            # Skip the keyword and the separator that follows it.
            extract = entry[21:]
            log.info(f'Got data extract filename: "{extract}", title: "{title}"')
        elif not extract:
            # Only a placeholder: never overwrite a filename found earlier.
            extract = "none"

    choices = entries[1]
    # NOTE(review): the first data row (position 0) is skipped, matching the
    # long-standing behaviour of this loop; confirm whether that is intended.
    for key, value in zip(choices["list_name"].iloc[1:], choices["name"].iloc[1:]):
        if key == "model" or str(key) == "nan":
            continue
        # Empty cells are read as NaN; skip them like the other placeholders.
        if value == "<text>" or str(value) in ("null", "nan"):
            continue
        self.tags.setdefault(key, []).append(value)

    # The yaml config file for the query has a list of columns
    # to keep in addition to this default set. These wind up
    # in the SELECT
    self.keep = [
        "name",
        "name:en",
        "id",
        "operator",
        "addr:street",
        "addr:housenumber",
        "osm_id",
        "title",
        "tags",
        "label",
        "landuse",
        "opening_hours",
        "tourism",
    ]
    # Use the stored config so a falsy ``config`` argument does not crash,
    # and test for the key itself rather than for a "keep" list member.
    query_config = self.qc
    if query_config is not None and "keep" in query_config.config:
        extra_columns = query_config.config["keep"]
        if extra_columns:
            self.keep.extend(extra_columns)

    return title, extract

cleanData ¶

cleanData(data)

Filter out any data not in the data_model.

Parameters:

Name	Type	Description	Default
`data`	`bytes`	The input data or filespec to the input data file	required

Returns:

Type	Description
`FeatureCollection`	The modified data

Source code in osm_fieldwork/filter_data.py

def cleanData(
    self,
    data,
):
    """Normalise feature properties so every feature has a title and label.

    Filtering against the data model is currently disabled, so no
    property is dropped. Instead, for every feature:

    * entries of a nested ``tags`` mapping are lifted to the top level,
    * ``osm_id`` is stored as ``id``,
    * any key starting with ``name`` (or, failing that, the id) supplies
      the ``title`` and ``label`` properties.

    Args:
        data (bytes): The input data or filespec to the input data file.
            A str is treated as the path of a GeoJSON file, and the result
            is also written next to it as ``new-<filename>``. Anything
            that is neither str nor bytes is used as already-parsed data.

    Returns:
        (FeatureCollection): The modified data

    Raises:
        FileExistsError: If ``data`` is a path and the output file
            already exists.
    """
    # Imported locally so this method carries its own dependency.
    from pathlib import Path

    log.debug("Cleaning data...")
    outpath = None
    if isinstance(data, str):
        inpath = Path(data)
        # Prefix only the file name so paths with directories keep working.
        outpath = inpath.with_name(f"new-{inpath.name}")
        with open(inpath, "r") as infile:
            indata = geojson.load(infile)
    elif isinstance(data, bytes):
        # SECURITY: eval() executes arbitrary code contained in the payload.
        # Only pass bytes from a trusted source. Kept because the payload
        # format (Python literal vs. strict JSON) cannot be confirmed here.
        indata = eval(data.decode())
    else:
        indata = data

    collection = []
    for feature in indata["features"]:
        properties = {}
        for key, value in feature["properties"].items():
            if key == "tags":
                # Flatten the nested tags into top-level properties.
                for tag, tag_value in value.items():
                    if tag.startswith("name"):
                        properties["title"] = tag_value
                        properties["label"] = tag_value
                    else:
                        properties[tag] = tag_value
            elif key == "osm_id":
                properties["id"] = value
                properties["title"] = value
                properties["label"] = value
            else:
                properties[key] = value
                if key.startswith("name"):
                    properties["title"] = value
                    properties["label"] = value
        if "title" not in properties:
            # Fall back to the id, but do not crash when there is none.
            if "id" in properties:
                properties["label"] = properties["id"]
                properties["title"] = properties["id"]
            else:
                log.warning("Feature has neither a name nor an id, no title set")
        collection.append(Feature(geometry=feature["geometry"], properties=properties))

    result = FeatureCollection(collection)
    if outpath is not None:
        # Mode "x" refuses to overwrite an existing file; the context
        # manager guarantees the data is flushed and the file closed.
        with open(outpath, "x") as outfile:
            geojson.dump(result, outfile)
    return result

options: show_source: false heading_level: 3