API Docs for osm-rawdata¶

config.py¶

Bases: object

Parse a config file into a data structure.

Parameters:

Name	Type	Description	Default
`boundary`	`Polygon`	The project boundary.	`None`

Source code in osm_rawdata/config.py

def __init__(self, boundary: Polygon = None):
    """Set up an empty query configuration.

    Args:
        boundary (Polygon): The project boundary.
    """
    # Each geometry table gets its own, independent, initially empty list
    # in both the "select" and the "where" sections.
    table_names = ("nodes", "ways_poly", "ways_line", "relations")
    self.config = dict(
        select={name: [] for name in table_names},
        tables=[],
        where={name: [] for name in table_names},
        keep=[],
    )
    # for polygon extracts, sometimes we just want the center point
    self.centroid = False
    self.geometry = boundary

parseYaml ¶

parseYaml(config)

Parse the YAML config file format into the internal data structure.

Parameters:

Name	Type	Description	Default
`config`	`(str, BytesIO)`	the file or BytesIO object to read.	required

Returns:

Name	Type	Description
`config`	`dict`	The config data.

Source code in osm_rawdata/config.py

def parseYaml(self, config: Union[str, BytesIO]):  # noqa N802
    """Read a YAML query config and fill the internal data structure from it.

    Args:
        config (str, BytesIO): the file or BytesIO object to read.

    Returns:
        config (dict): The config data.
    """
    loaded = self.load_yaml(config)

    # Each section parser receives the whole loaded document, in this order.
    for parse_section in (
        self._yaml_parse_tables,
        self._yaml_parse_where,
        self._yaml_parse_select_and_keep,
    ):
        parse_section(loaded)

    # The "keep" list is always taken straight from the document.
    self.config["keep"] = loaded.get("keep", [])
    return self.config

load_yaml `staticmethod` ¶

load_yaml(config)

Private method to load YAML data from a file.

Parameters:

Name	Type	Description	Default
`config`	`(str, BytesIO)`	The disk or memory file to read.	required

Returns:

Name	Type	Description
`data`	`dict`	The loaded YAML data.

Source code in osm_rawdata/config.py

@staticmethod
def load_yaml(config: Union[str, BytesIO]):
    """Load YAML data from a file on disk or from an in-memory buffer.

    Args:
        config (str, BytesIO): The disk or memory file to read.

    Returns:
        data (dict): The loaded YAML data.

    Raises:
        ValueError: If config is neither a path string nor a BytesIO object.
    """
    if isinstance(config, str):
        # Read bytes rather than text so decoding is left to the YAML parser
        # and does not depend on the platform's default locale encoding.
        with open(config, "rb") as file:
            return yaml.safe_load(file)
    if isinstance(config, BytesIO):
        return yaml.safe_load(config.getvalue())

    log.error(f"Unsupported config format: {config}")
    raise ValueError(f"Invalid config {config}")

parseJson ¶

parseJson(config)

Parse the JSON format config file using the Underpass schema.

Parameters:

Name	Type	Description	Default
`config`	`(str, BytesIO)`	the file or BytesIO object to read.	required

Returns:

Name	Type	Description
`config`	`dict`	the config data

Source code in osm_rawdata/config.py

def parseJson(self, config: Union[str, BytesIO]):  # noqa N802
    """Parse the JSON format config file using the Underpass schema.

    The document is flattened into colon-delimited keys. Top-level values are
    copied into the config as they are, "attributes" entries are added to the
    select fields and tables, and "tags" entries are added to the where
    fields. A "geometry" member, when present, replaces self.geometry.

    Args:
        config (str, BytesIO): the file or BytesIO object to read.

    Returns:
        config (dict): the config data

    Raises:
        ValueError: If config is neither a path string nor a BytesIO object.
    """
    # Check the type of config and load data accordingly
    if isinstance(config, str):
        # Read bytes so json detects the encoding itself instead of relying
        # on the platform's default locale encoding.
        with open(config, "rb") as config_file:
            data = json.load(config_file)
    elif isinstance(config, BytesIO):
        config.seek(0)  # Reset the file pointer to the beginning
        data = json.load(config)
    else:
        log.error(f"Unsupported config format: {config}")
        raise ValueError(f"Invalid config {config}")

    # Geometry names used by the JSON schema -> internal table names
    table_names = {"point": "nodes", "line": "ways_line", "polygon": "ways_poly"}
    all_tables = ("nodes", "ways_line", "ways_poly")

    # Extract geometry
    if geom_dict := data.get("geometry"):
        self.geometry = shape(geom_dict)

    # Iterate through each key-value pair in the flattened dictionary
    for key, value in flatdict.FlatDict(data).items():
        # Skip the keys related to geometry
        if key.startswith("geometry"):
            continue
        keys = key.split(":")
        # If it's a top-level key, directly update self.config
        if len(keys) == 1:
            self.config[key] = value
            continue

        # Extract meaningful parts from the key
        subsection = keys[1]
        geom_type = keys[2] if len(keys) > 2 else None
        tag_type = keys[3] if len(keys) > 3 else None
        # A tag name may itself contain the delimiter (e.g. "addr:street"),
        # so everything after the fourth part belongs to the name.
        tag_name = ":".join(keys[4:]) if len(keys) > 4 else None

        # Convert geometry type to meaningful names
        geom_type = table_names.get(geom_type, geom_type)
        # FIXME needs a refactor to handle all_geometry correctly
        targets = all_tables if geom_type == "all_geometry" else (geom_type,)

        if subsection == "attributes":
            # For attributes, update select fields and tables
            for attribute_name in value:
                for target in targets:
                    self.config["select"][target].append({attribute_name: {}})
                    self.config["tables"].append(target)
        elif subsection == "tags":
            # For tags, update where fields; tag_type[5:] drops the first
            # five characters (e.g. "join_or" -> "or").
            option = tag_type[5:] if tag_type else None
            new_tag = {tag_name: value, "op": option}
            for target in targets:
                self.config["where"][target].append(new_tag)

    return self.config

getKeys ¶

getKeys()

Source code in osm_rawdata/config.py

def getKeys(self):
    """List the column names named by the "select" section of the config.

    Returns:
        (list): "geometry" followed by every selected name, in config order.
    """
    # The first column returned is always the geometry
    columns = ["geometry"]
    for entries in self.config["select"].values():
        if not isinstance(entries, list):
            continue
        for entry in entries:
            if isinstance(entry, str):
                # A bare string is the column name itself
                columns.append(entry)
            else:
                # Otherwise the names are the keys of the mapping
                columns.extend(name for name, _ in entry.items())
    return columns

dump ¶

dump()

Dump the contents of the internal data structure for debugging purposes.

Source code in osm_rawdata/config.py

def dump(self):
    """Dump the contents of the internal data structure for debugging purposes.

    Prints the "select" and "where" sections of the config, followed by the
    geometry when one is set.
    """
    print("Dumping QueryConfig class")

    for key, value in self.config["select"].items():
        if not isinstance(value, list):
            print(f"\tSelecting tag '{key}'")
            continue
        for entry in value:
            if isinstance(entry, str):
                print(f"\tSelecting table '{key}' has value '{entry}'")
                continue
            for tag, _ in entry.items():
                print(f"\tSelecting table '{key}' tag '{tag}'")

    print("Where: ")
    for key, value in self.config["where"].items():
        if not isinstance(value, list):
            print(f"\tSelecting tag '{key}'")
            continue
        for entry in value:
            # Plain strings carry no operator, so handle them before
            # looking up "op" on the entry.
            if isinstance(entry, str):
                print(f"\tWhere table '{key}' has value '{entry}'")
                continue
            # The operator may be absent or None
            op = (entry.get("op") or "").upper()
            for tag, values in entry.items():
                if tag == "op":
                    continue
                print(f"\tWhere table '{key}', tag '{tag}' has values '{values}' {op}")

    if self.geometry:
        print(self.geometry)

options: show_source: false heading_level: 3

postgres.py¶

Bases: object

Parameters:

Name	Type	Description	Default
`dburi`	`str`	The URI string for the database connection	required

Source code in osm_rawdata/postgres.py

def __init__(
    self,
    dburi: str,
):
    """Set up a database connection, or an HTTP session for the remote API.

    When the parsed URI names the database "underpass", no database
    connection is opened: a requests session, the API base URL and the JSON
    request headers are stored instead. For any other database name a
    psycopg2 connection with autocommit enabled and a cursor are opened; if
    that fails the error is logged and self.dbshell stays None.

    Args:
        dburi (str): The URI string for the database connection
    """
    self.dbshell = None
    self.dbcursor = None
    self.uri = uriParser(dburi)
    if self.uri["dbname"] == "underpass":
        # Use a persistent connection, better for multiple requests
        self.session = requests.Session()
        # From here on self.uri holds the API base URL string rather than
        # the parsed URI; the RAW_DATA_API_URL environment variable
        # overrides the default.
        self.uri = os.getenv(
            "RAW_DATA_API_URL", "https://api-prod.raw-data.hotosm.org/v1"
        )
        self.headers = {
            "accept": "application/json",
            "Content-Type": "application/json",
        }
    else:
        # self.session and self.headers are not created in this branch.
        log.info(f"Opening database connection to: {self.uri['dbname']}")
        connect = "PG: dbname=" + self.uri["dbname"]
        # NOTE(review): the value assigned above is replaced whenever dbname
        # is set, and the elif below only runs when dbname is missing or
        # None, in which case the concatenation above has already failed.
        # So the host never appears to reach the connection string - confirm
        # whether that is intended.
        if "dbname" in self.uri and self.uri["dbname"] is not None:
            connect = f"dbname={self.uri['dbname']}"
        elif (
            "dbhost" in self.uri
            and self.uri["dbhost"] == "localhost"
            and self.uri["dbhost"] is not None
        ):
            connect = f"host={self.uri['dbhost']} dbname={self.uri['dbname']}"
        # User and password are appended only when present in the URI
        if "dbuser" in self.uri and self.uri["dbuser"] is not None:
            connect += f" user={self.uri['dbuser']}"
        if "dbpass" in self.uri and self.uri["dbpass"] is not None:
            connect += f" password={self.uri['dbpass']}"
        # log.debug(f"Connecting with: {connect}")
        try:
            self.dbshell = psycopg2.connect(connect)
            self.dbshell.autocommit = True
            self.dbcursor = self.dbshell.cursor()
            if self.dbcursor.closed != 0:
                log.error(f"Couldn't open cursor in {self.uri['dbname']}")
        except Exception as e:
            # Best effort: the failure is only logged, so callers must check
            # self.dbshell / self.dbcursor before using them.
            log.error(f"Couldn't connect to database: {e}")

createJson ¶

createJson(config, boundary, allgeom=False, extra_params={})

Generate a JSON file used for remote access to raw-data-api.

Uses the Underpass schema.

Parameters:

Name	Type	Description	Default
`config`	`QueryConfig`	The config data from the query config file	required
`boundary`	`Polygon`	The boundary polygon	required
`allgeom`	`bool`	Whether to return centroids or all the full geometry TODO this is not implemented.	`False`
`extra_params`	`dict`	Extra parameters to include in JSON config root. These params override existing values if set.	`{}`

Returns:

Name	Type	Description
`str`	`str`	The stringified JSON data.

Source code in osm_rawdata/postgres.py

def createJson(
    self,
    config: QueryConfig,
    boundary: GeojsonPolygon,
    allgeom: bool = False,
    extra_params: Optional[dict] = None,
) -> str:
    """Generate a JSON file used for remote access to raw-data-api.

    Uses the Underpass schema.

    Args:
        config (QueryConfig): The config data from the query config file
        boundary (GeojsonPolygon): The boundary polygon
        allgeom (bool): Whether to return centroids or all the full geometry
            TODO this is not implemented.
        extra_params (dict): Extra parameters to include in JSON config root.
            These params override existing values if set. Defaults to no
            extra parameters.

    Returns:
        str: The stringified JSON data.
    """
    json_data = {
        "geometry": boundary,
        "geometryType": self._get_geometry_types(config),
        "filters": self._get_filters(config),
        "centroid": config.config.get("centroid", False),
        "attributes": self._get_attributes(config),
        # Spread last so that caller-supplied keys win over the ones above
        **(extra_params or {}),
    }

    return json.dumps(json_data)

createSQL ¶

createSQL(config, allgeom=True)

Generate the SQL to query a local postgres database.

Parameters:

Name	Type	Description	Default
`config`	`QueryConfig`	The config data from the query config file	required
`allgeom`	`bool`	Whether to return centroids or all the full geometry	`True`

Returns:

Type	Description
`FeatureCollection`	the json

Source code in osm_rawdata/postgres.py

def createSQL(
    self,
    config: QueryConfig,
    allgeom: bool = True,
):
    """Generate the SQL to query a local postgres database.

    One SELECT statement is built for each table listed in the config. The
    selected columns are the geometry as text, osm_id, version and one
    tags->>'name' expression per selected tag. The WHERE clause lists the
    "or" conditions first, followed by the "and" conditions.

    Args:
        config (QueryConfig): The config data from the query config file
        allgeom (bool): Whether to return centroids or all the full geometry

    Returns:
        (list): The SQL query strings, one per configured table
    """

    def comparison(values):
        """Return the SQL comparison matching a tag against its values."""
        if len(values) == 1:
            if values[0] == "not null":
                return "IS NOT NULL"
            return f"='{values[0]}'"
        if len(values) > 0:
            return f" IN {str(tuple(values))}"
        # No values listed: any value of the tag matches
        return "IS NOT NULL"

    geometry = "ST_AsText(geom)" if allgeom else "ST_AsText(ST_Centroid(geom))"
    sql = []
    for table in config.config["tables"]:
        columns = [f"{geometry} AS geometry", "osm_id", "version"]
        for entry in config.config["select"][table]:
            # osm_id and version are always selected, don't repeat them
            columns.extend(
                f"tags->>'{tag}'"
                for tag, _ in entry.items()
                if tag not in ("osm_id", "version")
            )
        select = "SELECT " + ", ".join(columns)

        # If a way, we need the refs for conflating with JOSM
        if table == "ways_poly":
            select += ", refs "

        # Sort the where entries by operator. Entries with no operator, an
        # unknown operator or no tag at all contribute no condition.
        or_entries = []
        and_entries = []
        for entry in config.config["where"][table]:
            if not any(key != "op" for key in entry):
                continue
            op = entry.get("op")
            if op == "or":
                or_entries.append(entry)
            elif op == "and":
                and_entries.append(entry)

        jor = ""
        for entry in or_entries:
            for key, values in entry.items():
                if key == "op":
                    continue
                if isinstance(values, list) and values and isinstance(values[0], list):
                    # It's an array of values
                    jor += f"tags->>'{key}'=ANY(ARRAY{str(values[0])}) OR "
                    continue
                jor += f"tags->>'{key}' {comparison(values)} OR "

        jand = ""
        for entry in and_entries:
            for key, values in entry.items():
                if key == "op":
                    continue
                jand += f"tags->>'{key}' {comparison(values)} AND "

        # Every condition ends with its operator, and without any condition
        # the statement ends with WHERE, so the last word is always dropped.
        query = f"{select} FROM {table} WHERE {jor} {jand}".rstrip()
        sql.append(query[: query.rfind(" ")])

    return sql

createTable ¶

createTable(sql)

Create a table in the database

Parameters:

Name	Type	Description	Default
`sql`	`str`	The SQL	required

Returns:

Type	Description
`bool`	The table creation status

Source code in osm_rawdata/postgres.py

def createTable(
    self,
    sql: str,
):
    """Run the supplied SQL on the database cursor to create a table.

    Args:
        sql (str): The SQL

    Returns:
        (bool): The table creation status
    """
    log.info("Creating table schema")
    cursor = self.dbcursor
    cursor.execute(sql)
    return True

execute ¶

execute(sql)

Execute a raw SQL query and return the results.

Parameters:

Name	Type	Description	Default
`sql`	`str`	The SQL to execute	required

Returns:

Type	Description
`list`	The results of the query

Source code in osm_rawdata/postgres.py

def execute(
    self,
    sql: str,
):
    """Execute a raw SQL query and return the results.

    Errors raised while executing or fetching are logged and an empty list is
    returned instead. Interpreter-level signals such as KeyboardInterrupt are
    not intercepted.

    Args:
        sql (str): The SQL to execute

    Returns:
        (list): The results of the query, empty when the query failed
    """
    try:
        self.dbcursor.execute(sql)
        return self.dbcursor.fetchall()
    except Exception as e:
        log.error(f"Couldn't execute query! {sql}: {e}")
        return list()

queryLocal ¶

queryLocal(query, allgeom=True, boundary=None)

Query a local postgres database.

Parameters:

Name	Type	Description	Default
`query`	`str`	The SQL query to execute	required
`allgeom`	`bool`	Whether to return centroids or all the full geometry	`True`
`boundary`	`Polygon`	The boundary polygon	`None`

Returns:

Name	Type	Description
`query`	`FeatureCollection`	the results of the query

Source code in osm_rawdata/postgres.py

def queryLocal(
    self,
    query: str,
    allgeom: bool = True,
    boundary: Polygon = None,
):
    """Run a query against the local postgres database.

    When a boundary is given, four views restricted to that boundary are
    (re)created first and the table name in the query is replaced by the
    matching view name.

    Args:
        query (str): The SQL query to execute
        allgeom (bool): Whether to return centroids or all the full geometry
            (not read in this method)
        boundary (Polygon): The boundary polygon

    Returns:
        (FeatureCollection, list): A FeatureCollection built from the rows,
            or the raw rows when no where-config is loaded for ways_poly and
            nodes, or when the only row has at most one column.
    """
    features = list()
    # if no boundary, it's already been setup
    if boundary:
        # Each statement drops the view if present and recreates it with only
        # the rows whose geom is contained in the boundary (SRID 4326).
        sql = f"DROP VIEW IF EXISTS ways_view;CREATE VIEW ways_view AS SELECT * FROM ways_poly WHERE ST_CONTAINS(ST_GeomFromEWKT('SRID=4326;{boundary.wkt}'), geom)"
        self.dbcursor.execute(sql)
        sql = f"DROP VIEW IF EXISTS nodes_view;CREATE VIEW nodes_view AS SELECT * FROM nodes WHERE ST_CONTAINS(ST_GeomFromEWKT('SRID=4326;{boundary.wkt}'), geom)"
        self.dbcursor.execute(sql)
        sql = f"DROP VIEW IF EXISTS lines_view;CREATE VIEW lines_view AS SELECT * FROM ways_line WHERE ST_CONTAINS(ST_GeomFromEWKT('SRID=4326;{boundary.wkt}'), geom)"
        self.dbcursor.execute(sql)
        # NOTE(review): unlike the three views above this one is TEMP and
        # selects FROM nodes rather than a relations table - confirm intent.
        sql = f"DROP VIEW IF EXISTS relations_view;CREATE TEMP VIEW relations_view AS SELECT * FROM nodes WHERE ST_CONTAINS(ST_GeomFromEWKT('SRID=4326;{boundary.wkt}'), geom)"
        self.dbcursor.execute(sql)

        # Point the query at the matching view. Only the first table name
        # found (surrounded by spaces, not at position 0) is handled.
        if query.find(" ways_poly ") > 0:
            query = query.replace("ways_poly", "ways_view")
        elif query.find(" ways_line ") > 0:
            query = query.replace("ways_line", "lines_view")
        elif query.find(" nodes ") > 0:
            query = query.replace("nodes", "nodes_view")
        elif query.find(" relations ") > 0:
            query = query.replace("relations", "relations_view")

    # log.debug(query)
    # The execute call is outside the try below, so its errors propagate.
    self.dbcursor.execute(query)
    try:
        result = self.dbcursor.fetchall()
        # log.debug("SQL Query returned %d records" % len(result))
    except:
        # Any failure to fetch yields an empty FeatureCollection
        return FeatureCollection(features)

    # If there is no config file, don't modify the results
    if (
        len(self.qc.config["where"]["ways_poly"]) == 0
        and len(self.qc.config["where"]["nodes"]) == 0
    ):
        return result

    for item in result:
        # A single row with at most one column is returned untouched
        if len(item) <= 1 and len(result) == 1:
            return result
            # break
        # print(f"{item}")
        tags = dict()
        # The first column is parsed as WKT, the third is stored as version
        geom = wkt.loads(item[0])
        # tags["id"] = item[1]
        tags["version"] = item[2]
        # When the query selects refs, it is taken from the last column
        if query.find(" refs ") > 0:
            tags["refs"] = str(item[len(item) - 1])
            # breakpoint()
        i = 3
        # Figure out the tags from the SELECT part of the query
        keys = query.replace(",", "").replace("tags->>", "").replace("'", "")
        end = keys.find("FROM")
        res = keys[:end].split(" ")
        # This should be the geometry
        geom = wkt.loads(item[0])
        # NOTE(review): res holds every space-separated word of the SELECT
        # part and is indexed with i while the row is indexed with i - 1;
        # verify that names and values line up for the generated queries.
        for i in range(2, len(item)):
            # print(f"{res[i]} = {item[i - 1]}")
            if item[i - 1] is None:
                continue
            tags[res[i]] = item[i - 1]
        features.append(Feature(geometry=geom, properties=tags))
    return FeatureCollection(features)

queryRemote ¶

queryRemote(query)

This queries a remote postgres database using the FastAPI backend to the HOT Export Tool.

Parameters:

Name	Type	Description	Default
`query`	`str`	The JSON query to execute.	required

Returns:

Type	Description
`(str, FeatureCollection, BytesIO)`	either the data URL if bind_zip=False, extracted geojson, else BytesIO file. Returns None on failure.

Source code in osm_rawdata/postgres.py

def queryRemote(
    self,
    query: str,
) -> Optional[Union[str, dict, BytesIO]]:
    """Query a remote postgres database through the raw data API.

    The JSON query is posted to the snapshot endpoint, the returned task is
    polled until it succeeds, fails or ten minutes of waiting have
    accumulated, and the resulting download is then fetched.

    Args:
        query (str): The JSON query to execute.

    Returns:
        (str, FeatureCollection, BytesIO): the data URL when the download URL
            does not end in ".zip"; otherwise the file extracted from the zip,
            parsed as JSON for the "geojson" output type or wrapped in a
            BytesIO for any other type. Returns None on failure.
    """
    # Send the request to raw data api
    result = None

    url = f"{self.uri}/snapshot/"
    try:
        log.debug(f"Raw Data API snapshot JSON config: {query}")
        result = self.session.post(url, data=query, headers=self.headers)
        result.raise_for_status()
    except requests.exceptions.HTTPError:
        # Only HTTP error statuses are handled here; other request errors
        # are not caught by this handler.
        if result is not None:
            error_dict = result.json()
            error_dict["status_code"] = result.status_code
            log.error(f"Failed to get extract from Raw Data API: {error_dict}")
            return None
        else:
            log.error("Failed to make request to raw data API")

    if result is None:
        log.error("Raw Data API did not return a response. Skipping.")
        return None

    if result.status_code != 200:
        # Reads the message of the first entry of the "detail" list
        error_message = result.json().get("detail")[0].get("msg")
        log.error(f"{error_message}")
        return None

    task_id = result.json().get("task_id")
    task_query_url = f"{self.uri}/tasks/status/{task_id}"
    log.debug(f"Raw Data API Query URL: {task_query_url}")

    polling_interval = 2  # Initial polling interval in seconds
    max_polling_duration = (
        600  # Maximum duration for polling in seconds (10 minutes)
    )
    # Counts only the time spent sleeping between polls, not request time
    elapsed_time = 0

    while elapsed_time < max_polling_duration:
        response = self.session.get(task_query_url, headers=self.headers)
        response_json = response.json()
        response_status = response_json.get("status")
        task_info = response_json.get("result", {})

        log.debug(f"Current status: {response_status}")

        # First check to see if FAILURE and stop polling
        if response_status == "FAILURE":
            # NOTE bug we must override task_info as it's set to a string
            task_info = {}
            log.error(f"Raw-data-api task FAILURE. Details: {task_query_url}")
            break

        # response_status options: STARTED, PENDING, SUCCESS
        if (
            response_status != "SUCCESS"
            or not isinstance(task_info, dict)
            or not task_info.get("download_url")
        ):
            # Adjust polling frequency after the first minute
            if elapsed_time > 60:
                polling_interval = (
                    10  # Poll every 10 seconds after the first minute
                )

            # Wait before polling again
            log.debug(
                f"Waiting {polling_interval} seconds before polling API again..."
            )
            time.sleep(polling_interval)
            elapsed_time += polling_interval

        else:
            # response_status="SUCCESS" and download_url present
            break

    else:
        # Runs only when the loop ends without a break, i.e. the
        # maximum polling duration was reached
        log.error(f"{max_polling_duration} second elapsed. Aborting data extract.")
        return None

    log.debug(f"Raw Data API Response: {task_info}")
    # After a FAILURE task_info is empty, so this is None and we return below
    data_url = task_info.get("download_url")

    if not data_url:
        log.error("Raw data api no download_url returned. Skipping.")
        return None

    # Anything that is not a zip archive is handed back as a URL
    if not data_url.endswith(".zip"):
        return data_url

    # Use the file name from the query if set, else RawExport.geojson
    query_dict = json.loads(query)
    file_type = query_dict.get("outputType", "geojson")
    filename = f"{query_dict.get('fileName', 'RawExport')}.{file_type}"
    # Get zip file and extract
    with self.session.get(data_url, headers=self.headers) as response:
        # The whole archive is held in memory before extraction
        buffer = BytesIO(response.content)
        with zipfile.ZipFile(buffer, "r") as zipped_file:
            with zipped_file.open(filename) as extracted_data:
                if file_type == "geojson":
                    return json.load(extracted_data)
                else:
                    return BytesIO(extracted_data.read())

options: show_source: false heading_level: 3

Bases: DatabaseAccess

Class to handle SQL queries for the categories.

Parameters:

Name	Type	Description	Default
`uri`	`str`	The URI string for the database connection.	required
`config`	`(str, BytesIO)`	The query config file path or BytesIO object. Currently only YAML format is accepted if BytesIO is passed.	`None`

Returns:

Type	Description
`bool`	Whether the database connection was successful

Source code in osm_rawdata/postgres.py

def __init__(
    self,
    uri: str,
    config: Optional[Union[str, BytesIO]] = None,
    auth_token: Optional[str] = None,
    # output: str = None
):
    """This is a client for a postgres database.

    Args:
        uri (str): The URI string for the database connection.
        config (str, BytesIO): The query config file path or BytesIO object.
            A path must end in .json, .yaml or .yml. A BytesIO object is
            treated as JSON when it parses as JSON and as YAML otherwise.
        auth_token (str): Optional access token added to the request headers
            used for the remote API.

    Raises:
        FileNotFoundError: If the config path does not exist.
        ValueError: If the config has an unsupported file suffix or type.
    """
    super().__init__(uri)
    self.qc = QueryConfig()

    # Optional authentication. The request headers are only created by the
    # parent class when it sets up the remote API session, so for a local
    # database connection there is nothing to attach the token to.
    if auth_token and hasattr(self, "headers"):
        self.headers["access-token"] = auth_token

    if config:
        # filespec string passed
        if isinstance(config, str):
            path = Path(config)
            if not path.exists():
                raise FileNotFoundError(f"Config file does not exist {config}")
            with open(config, "rb") as config_file:
                config_data = BytesIO(config_file.read())
            suffix = path.suffix.lower()
            if suffix == ".json":
                config_type = "json"
            elif suffix in (".yaml", ".yml"):
                config_type = "yaml"
            else:
                log.error(f"Unsupported file format: {config}")
                raise ValueError(f"Invalid config {config}")

        # BytesIO object passed
        elif isinstance(config, BytesIO):
            config.seek(0)  # Reset the file pointer to the beginning
            config_data = config
            try:
                # Is JSON
                json.load(config_data)
                log.debug("Parsed config is JSON format")
                config_type = "json"
            except json.JSONDecodeError as e:
                log.error(e)
                # Is YAML
                log.debug("Parsed config is YAML format")
                config_type = "yaml"

        else:
            log.warning(f"Config input is invalid for PostgresClient: {config}")
            raise ValueError(f"Invalid config {config}")

        # Parse the config
        if config_type == "json":
            self.qc.parseJson(config_data)
        elif config_type == "yaml":
            self.qc.parseYaml(config_data)

createDB ¶

createDB(dburi)

Setup the postgres database connection.

Parameters:

Name	Type	Description	Default
`dburi`	`str`	The URI string for the database connection	required

Returns:

Name	Type	Description
`status`	`bool`	Whether the database connection was successful

Source code in osm_rawdata/postgres.py

def createDB(self, dburi: uriParser):
    """Create the database and add the postgis and hstore extensions.

    NOTE(review): dburi is not read in the body; the database name comes
    from self.dbname - confirm where that attribute is set.

    Args:
        dburi (str): The URI string for the database connection

    Returns:
        status (bool): Always True when no exception is raised
    """
    # NOTE(review): verify that the target PostgreSQL version accepts
    # IF NOT EXISTS on CREATE DATABASE.
    sql = f"CREATE DATABASE IF NOT EXISTS {self.dbname}"
    self.dbcursor.execute(sql)
    # NOTE(review): confirm fetchall() is valid after a statement that
    # presumably returns no rows; the same applies further down.
    result = self.dbcursor.fetchall()
    log.info("Query returned %d records" % len(result))
    # result = subprocess.call("createdb", uri.dbname)

    # Add the extensions needed
    sql = "CREATE EXTENSION postgis; CREATE EXTENSION hstore;"
    self.dbcursor.execute(sql)
    result = self.dbcursor.fetchall()
    log.info("Query returned %d records" % len(result))
    return True

execQuery ¶

execQuery(boundary, customsql=None, allgeom=True, extra_params={})

Execute the query using a local postgres database, or a remote one that uses the Underpass schema.

Parameters:

Name	Type	Description	Default
`boundary`	`(FeatureCollection, Feature, dict, str)`	The boundary polygon.	required
`customsql`	`str`	Don't create the SQL, use the one supplied.	`None`
`allgeom`	`bool`	Whether to return centroids or all the full geometry.	`True`

Returns:

Name	Type	Description
`query`	`FeatureCollection`	the json

Source code in osm_rawdata/postgres.py

def execQuery(
    self,
    boundary: Union[FeatureCollection, Feature, dict, str],
    customsql: str = None,
    allgeom: bool = True,
    extra_params: Optional[dict] = None,
):
    """Execute the query using a local postgres database, or a remote one
    that uses the Underpass schema.

    The local database is used when a connection is open (self.dbshell);
    otherwise the query is sent to the remote API.

    Args:
        boundary (FeatureCollection, Feature, dict, str): The boundary polygon.
        customsql (str): Don't create the SQL, use the one supplied.
        allgeom (bool): Whether to return centroids or all the full geometry.
        extra_params (dict): Extra parameters for the remote JSON config.
            Defaults to no extra parameters.

    Returns:
        query (FeatureCollection): the json for a local query; for a remote
            query, whatever the remote call returned.

    Raises:
        ValueError: If the boundary is a FeatureCollection without features,
            or is not a Polygon / MultiPolygon when querying a local database.
    """
    log.info("Parsing AOI geojson for data extract")
    if extra_params is None:
        extra_params = {}

    # Parse JSON string type
    if isinstance(boundary, str):
        boundary = json.loads(boundary)

    # If multiple geoms are passed, unary_union them
    if (geom_type := boundary.get("type")) == "FeatureCollection":
        # Convert each feature into a Shapely geometry
        geometries = [
            shape(feature.get("geometry"))
            for feature in boundary.get("features", [])
        ]
        if not geometries:
            raise ValueError("Boundary FeatureCollection contains no features")
        merged_geom = (
            unary_union(geometries) if len(geometries) > 1 else geometries[0]
        )
    elif geom_type == "Feature":
        merged_geom = shape(boundary.get("geometry"))
    else:
        merged_geom = shape(boundary)

    if self.dbshell:
        # Only the exterior rings are kept, so holes in the boundary are
        # ignored when filtering.
        if isinstance(merged_geom, MultiPolygon):
            aoi_shape = MultiPolygon(
                [Polygon(poly.exterior) for poly in merged_geom.geoms]
            )
        elif isinstance(merged_geom, Polygon):
            aoi_shape = Polygon(merged_geom.exterior)
        else:
            raise ValueError(
                "Boundary must be a Polygon or MultiPolygon, "
                f"got {type(merged_geom).__name__}"
            )

        log.info("Extracting features from Postgres...")
        if not customsql:
            sql = self.createSQL(self.qc, allgeom)
        else:
            sql = [customsql]
        alldata = list()
        for query in sql:
            result = self.queryLocal(query, allgeom, aoi_shape)
            if len(result) > 0:
                alldata += result["features"]
        collection = FeatureCollection(alldata)
    else:
        log.info("Extracting features via remote call...")
        json_config = self.createJson(
            self.qc, mapping(merged_geom), allgeom, extra_params
        )
        collection = self.queryRemote(json_config)
        # bind_zip=False, data is not zipped, return URL directly
        if not json.loads(json_config).get("bind_zip", True):
            return collection

    if not collection:
        log.warning("No data returned for data extract")

    return collection

options: show_source: false heading_level: 3

Bases: object

Source code in osm_rawdata/geofabrik.py

def __init__(self):
    """Load the Geofabrik region definitions into ``self.regions``.

    Reads ``geofabrik.yaml`` from ``rootdir`` and parses it as YAML. If the
    file cannot be read, the error is logged and the interpreter exits via
    ``quit()``.
    """
    # find the path to the test data files
    filespec = f"{rootdir}/geofabrik.yaml"
    try:
        # The context manager closes the handle even when read() fails
        with open(filespec, "rb") as stream:
            content = stream.read()
    except Exception as e:
        print(sys.argv)
        log.error(f"Couldn't open {filespec}: {e}")
        quit()
    # NOTE(review): yaml.Loader can construct arbitrary Python objects.
    # Presumably fine for a trusted file under rootdir, but consider
    # yaml.safe_load if this file can ever come from an external source.
    self.regions = yaml.load(content, Loader=yaml.Loader)

options: show_source: false heading_level: 3

Bases: object

Parse a config file into a data structure.

Parameters:

Name	Type	Description	Default
`boundary`	`Polygon`	The project boundary.	`None`

Source code in osm_rawdata/config.py

def __init__(self, boundary: Polygon = None):
    """Set up an empty query configuration.

    The "select" and "where" sections each hold one list per OSM table
    name; "tables" and "keep" start out as empty lists.

    Args:
        boundary (Polygon): The project boundary.
    """
    table_names = ("nodes", "ways_poly", "ways_line", "relations")
    self.config = {
        "select": {table: [] for table in table_names},
        "tables": [],
        "where": {table: [] for table in table_names},
        "keep": [],
    }
    # for polygon extracts, sometimes we just want the center point
    self.centroid = False
    self.geometry = boundary

parseYaml ¶

parseYaml(config)

Parse the YAML config file format into the internal data structure.

Parameters:

Name	Type	Description	Default
`config`	`(str, BytesIO)`	the file or BytesIO object to read.	required

Returns:

Name	Type	Description
`config`	`dict`	The config data.

Source code in osm_rawdata/config.py

def parseYaml(self, config: Union[str, BytesIO]):  # noqa N802
    """Load a YAML config and fold it into the internal data structure.

    The document is read with load_yaml() and handed to the tables, where
    and select/keep parsers in that order. The "keep" entry of the config
    is then set from the document's "keep" key (an empty list if absent).

    Args:
        config (str, BytesIO): the file or BytesIO object to read.

    Returns:
        config (dict): The config data.
    """
    document = self.load_yaml(config)

    # Each helper receives the whole loaded document
    self._yaml_parse_tables(document)
    self._yaml_parse_where(document)
    self._yaml_parse_select_and_keep(document)

    keep_list = document.get("keep", [])
    self.config["keep"] = keep_list
    return self.config

load_yaml `staticmethod` ¶

load_yaml(config)

Private method to load YAML data from a file.

Parameters:

Name	Type	Description	Default
`config`	`(str, BytesIO)`	The disk or memory file to read.	required

Returns:

Name	Type	Description
`data`	`dict`	The loaded YAML data.

Source code in osm_rawdata/config.py

@staticmethod
def load_yaml(config: Union[str, BytesIO]):
    """Read YAML from a path on disk or from an in-memory buffer.

    Args:
        config (str, BytesIO): The disk or memory file to read.

    Returns:
        data (dict): The loaded YAML data.

    Raises:
        ValueError: If config is neither a str nor a BytesIO.
    """
    # In-memory buffer: parse its full contents
    if isinstance(config, BytesIO):
        return yaml.safe_load(config.getvalue())

    # A string is treated as a path to open
    if isinstance(config, str):
        with open(config, "r") as stream:
            return yaml.safe_load(stream)

    log.error(f"Unsupported config format: {config}")
    raise ValueError(f"Invalid config {config}")

parseJson ¶

parseJson(config)

Parse the JSON format config file using the Underpass schema.

Parameters:

Name	Type	Description	Default
`config`	`(str, BytesIO)`	the file or BytesIO object to read.	required

Returns:

Name	Type	Description
`config`	`dict`	the config data

Source code in osm_rawdata/config.py

def parseJson(self, config: Union[str, BytesIO]):  # noqa N802
    """Read an Underpass-schema JSON config and merge it into ``self.config``.

    The document is flattened with flatdict and each flattened key is split
    on ":". Single-part keys are copied straight into the config;
    "attributes" entries extend the select lists and the table list, and
    "tags" entries extend the where lists. A "geometry" entry, if present,
    becomes ``self.geometry``.

    Args:
        config (str, BytesIO): the file or BytesIO object to read.

    Returns:
        config (dict): the config data

    Raises:
        ValueError: If config is neither a str nor a BytesIO.
    """
    # Load the raw document from memory or from disk
    if isinstance(config, BytesIO):
        config.seek(0)  # Rewind so the whole buffer is read
        data = json.load(config)
    elif isinstance(config, str):
        with open(config, "r") as config_file:
            data = json.load(config_file)
    else:
        log.error(f"Unsupported config format: {config}")
        raise ValueError(f"Invalid config {config}")

    # JSON geometry names -> internal table names; other names pass through
    table_names = {"point": "nodes", "line": "ways_line", "polygon": "ways_poly"}
    every_table = ["nodes", "ways_line", "ways_poly"]

    # Extract geometry
    geom_dict = data.get("geometry")
    if geom_dict:
        self.geometry = shape(geom_dict)

    for key, value in flatdict.FlatDict(data).items():
        # Geometry was handled above
        if key.startswith("geometry"):
            continue

        parts = key.split(":")
        # Top-level keys are stored as-is
        if len(parts) == 1:
            self.config[key] = value
            continue

        # parts[0] is the section (unused); the trailing parts are optional
        subsection = parts[1]
        geom_type, tag_type, tag_name = (parts[2:5] + [None] * 3)[:3]
        geom_type = table_names.get(geom_type, geom_type)

        # FIXME needs a refactor to handle all_geometry correctly
        targets = every_table if geom_type == "all_geometry" else [geom_type]

        if subsection == "attributes":
            # Attributes feed both the select fields and the table list
            for attribute_name in value:
                for table in targets:
                    self.config["select"][table].append({attribute_name: {}})
                    self.config["tables"].append(table)
        elif subsection == "tags":
            # The operator is whatever follows the first five characters
            # of the tag type
            option = tag_type[5:] if tag_type else None
            new_tag = {tag_name: value, "op": option}
            # The same dict object is appended to every target table
            for table in targets:
                self.config["where"][table].append(new_tag)

    return self.config

getKeys ¶

getKeys()

Source code in osm_rawdata/config.py

def getKeys(self):
    """Collect the column names named in the "select" section.

    Returns:
        (list): "geometry" followed by every selected name, in config
            order. Plain string entries are used as-is; for dict entries
            each key is used. Non-list sections are ignored.
    """
    # The first column returned is always the geometry
    columns = ["geometry"]
    for entries in self.config["select"].values():
        if not isinstance(entries, list):
            continue
        for entry in entries:
            if isinstance(entry, str):
                columns.append(entry)
            else:
                columns.extend(name for name, _ in entry.items())
    return columns

dump ¶

dump()

Dump the contents of the internal data structure for debugging purposes.

Source code in osm_rawdata/config.py

def dump(self):
    """Dump the contents of the internal data structure for debugging purposes.

    Prints the "select" entries, then the "where" entries together with
    their upper-cased join operator, and finally the boundary geometry if
    one is set. Nothing is returned.
    """
    print("Dumping QueryConfig class")

    for key, value in self.config["select"].items():
        if not isinstance(value, list):
            print(f"\tSelecting tag '{key}'")
            continue
        for v in value:
            if isinstance(v, str):
                print(f"\tSelecting table '{key}' has value '{v}'")
                continue
            for k1 in v:
                print(f"\tSelecting table '{key}' tag '{k1}'")

    print("Where: ")
    for key, value in self.config["where"].items():
        if not isinstance(value, list):
            print(f"\tSelecting tag '{key}'")
            continue
        for v in value:
            # Plain strings carry no operator, so handle them before
            # looking up "op" (indexing a str with "op" raises TypeError)
            if isinstance(v, str):
                print(f"\tWhere table '{key}' has value '{v}'")
                continue
            # "op" may be missing or None (parseJson stores None when no
            # tag type is present); print no operator in that case
            raw_op = v.get("op")
            op = raw_op.upper() if raw_op else ""
            for k1, v1 in v.items():
                if k1 == "op":
                    continue
                print(f"\tWhere table '{key}', tag '{k1}' has values '{v1}' {op}")

    if self.geometry:
        print(self.geometry)

options: show_source: false heading_level: 3

importer.py¶

Bases: object

Parameters:

Name	Type	Description	Default
`dburi`	`str`	The URI string for the database connection	required

Returns:

Type	Description
`OsmImporter`	An instance of this class

Source code in osm_rawdata/importer.py

def __init__(
    self,
    dburi: str,
):
    """Set up a local database for OSM data and open connections to it.

    Creates one engine and one open connection per loop iteration,
    ``cores + 1`` in total, creating the database first if it does not
    exist. On the first iteration the postgis, hstore and dblink
    extensions are requested and the tables registered on ``Base`` are
    created.

    Args:
        dburi (str): The URI string for the database connection, without
            the leading "postgresql://" (it is prepended here)
    """
    self.dburi = dburi
    self.db = None
    # Open connections, one per loop iteration below
    self.connections = list()
    # NOTE(review): `cores` is defined elsewhere in the module; presumably
    # the CPU count — confirm.
    for thread in range(0, cores + 1):
        engine = create_engine(f"postgresql://{self.dburi}", echo=False)
        if not database_exists(engine.url):
            create_database(engine.url)
        self.connections.append(engine.connect())
        # NOTE(review): the session factory is built but never stored, so
        # nothing in this block uses it.
        sessionmaker(autocommit=False, autoflush=False, bind=engine)

        # One-time schema setup, done only with the first engine
        if thread == 0:
            # NOTE(review): no tables are attached to this fresh MetaData
            # within this block, so create_all() appears to have nothing
            # to create — confirm.
            meta = MetaData()
            meta.create_all(engine)

            # if dburi:
            # self.uri = uriParser(dburi)
            # engine = create_engine(f"postgresql://{self.dburi}", echo=True)
            # if not database_exists(engine.url):
            #     create_database(engine.url)
            # self.db = engine.connect()

            # Add the extension we need to process the data
            sql = text(
                "CREATE EXTENSION IF NOT EXISTS postgis; CREATE EXTENSION IF NOT EXISTS hstore;CREATE EXTENSION IF NOT EXISTS dblink;"
            )
            # NOTE(review): no explicit commit follows this execute();
            # whether the statements persist depends on the SQLAlchemy
            # version's autocommit behaviour — confirm.
            self.connections[0].execute(sql)

            # Create the tables declared on the project's declarative Base
            Base.metadata.create_all(bind=engine)

            sessionmaker(autocommit=False, autoflush=False, bind=engine)

importOSM ¶

importOSM(infile)

Import an OSM data file into a postgres database.

Parameters:

Name	Type	Description	Default
`infile`	`str`	The file to import	required

Returns: (bool): Whether the import finished successfully

Source code in osm_rawdata/importer.py

def importOSM(
    self,
    infile: str,
):
    """Import an OSM data file into a postgres database.

    Runs ``osm2pgsql`` in create mode with flex output, using the
    ``raw.lua`` style file under ``rootdir/import``. The target database
    name is taken from the parsed database URI.

    Args:
        infile (str): The file to import

    Returns:
        (bool): True when osm2pgsql exits successfully

    Raises:
        subprocess.CalledProcessError: If osm2pgsql exits with a
            non-zero status.
    """
    # osm2pgsql --create -d nigeria --extra-attributes --output=flex --style raw.lua nigeria-latest-internal.osm.pbf
    uri = uriParser(self.dburi)
    command = [
        "osm2pgsql",
        "--create",
        "-d",
        f"{uri['dbname']}",
        "--extra-attributes",
        "--output=flex",
        "--style",
        f"{rootdir}/import/raw.lua",
        f"{infile}",
    ]
    # check=True raises CalledProcessError on a non-zero exit status
    subprocess.run(command, check=True)
    # Return the documented bool instead of falling off the end with None
    return True

importParquet ¶

importParquet(infile)

Import an Overture parquet data file into a postgres database.

Parameters:

Name	Type	Description	Default
`infile`	`str`	The file to import	required

Returns:

Type	Description
`bool`	Whether the import finished successfully

Source code in osm_rawdata/importer.py

def importParquet(
    self,
    infile: str,
):
    """Import an Overture parquet data file into a postgres database.

    The file is loaded through ``Overture`` and its rows are split into at
    most ``cores`` blocks, each handed to ``parquetThread`` together with
    its own database connection. When everything fits in one block the
    import runs directly in the calling process.

    Args:
        infile (str): The file to import

    Returns:
        (bool): Whether the import finished successfully
    """
    # spin = PixelSpinner(f"Processing {infile}...")
    timer = Timer(text="importParquet() took {seconds:.0f}s")
    timer.start()
    overture = Overture(infile)

    connections = list()
    for thread in range(0, cores + 1):
        engine = create_engine(f"postgresql://{self.dburi}", echo=False)
        if not database_exists(engine.url):
            create_database(engine.url)
        connections.append(engine.connect())
        sessionmaker(autocommit=False, autoflush=False, bind=engine)

        if thread == 0:
            meta = MetaData()
            meta.create_all(engine)

    # A chunk is the number of entries handed to one worker
    entries = len(overture.data)
    log.debug(f"There are {entries} entries in {infile}")
    # Ceiling division, never below 1: a zero step would never advance the
    # dispatch loop, and rounding down could yield more blocks than there
    # are connections.
    chunk = max(1, (entries + cores - 1) // cores)

    # For small files we only need one worker
    if entries <= chunk:
        parquetThread(overture.data, connections[0])
        timer.stop()
        return True

    success = True
    with concurrent.futures.ProcessPoolExecutor(max_workers=cores) as executor:
        futures = []
        # range() stops before `entries`, so no empty block is dispatched
        for index, block in enumerate(range(0, entries, chunk)):
            log.debug("Dispatching Block %d:%d" % (block, block + chunk))
            futures.append(
                executor.submit(
                    parquetThread,
                    overture.data[block : block + chunk],
                    connections[index],
                )
            )
        # Report worker failures instead of silently dropping them
        for future in futures:
            error = future.exception()
            if error is not None:
                log.error(f"importParquet() worker failed: {error}")
                success = False
    timer.stop()

    return success

importGeoJson ¶

importGeoJson(infile)

Import a GeoJson data file into a postgres database.

Parameters:

Name	Type	Description	Default
`infile`	`str`	The file to import	required

Returns:

Type	Description
`bool`	Whether the import finished successfully

Source code in osm_rawdata/importer.py

def importGeoJson(
    self,
    infile: str,
):
    """Import a GeoJson data file into a postgres database.

    The features are split into at most ``cores`` blocks, each handed to
    ``importThread`` with one of the connections opened by the
    constructor. When everything fits in one block the import runs
    directly in the calling thread.

    Args:
        infile (str): The file to import

    Returns:
        (bool): Whether the import finished successfully
    """
    # load the GeoJson file; the context manager closes the handle
    with open(infile, "r") as file:
        data = geojson.load(file)

    timer = Timer(text="importGeoJson() took {seconds:.0f}s")
    timer.start()

    # A chunk is the number of features handed to one worker
    features = data["features"]
    entries = len(features)
    # Ceiling division, never below 1: a zero step would never advance the
    # dispatch loop, and rounding down could yield more blocks than there
    # are connections.
    chunk = max(1, (entries + cores - 1) // cores)

    # For small files we only need one thread
    if entries <= chunk:
        importThread(features, self.connections[0])
        timer.stop()
        return True

    success = True
    with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:
        futures = []
        # range() stops before `entries`, so no empty block is dispatched
        for index, block in enumerate(range(0, entries, chunk)):
            log.debug("Dispatching Block %d:%d" % (block, block + chunk))
            futures.append(
                executor.submit(
                    importThread,
                    features[block : block + chunk],
                    self.connections[index],
                )
            )
        # Report worker failures instead of silently dropping them
        for future in futures:
            error = future.exception()
            if error is not None:
                log.error(f"importGeoJson() worker failed: {error}")
                success = False
    timer.stop()

    return success

options: show_source: false heading_level: 3

overture.py¶

Bases: object

Parameters:

Name	Type	Description	Default
`filespec`	`str`	The parquet file to load	`None`

Source code in osm_rawdata/overture.py

def __init__(
    self,
    filespec: str = None,
):
    """A class for parsing Overture V2 files.

    Args:
        filespec (str): The parquet file to load. When omitted, or when
            the file cannot be read, ``self.data`` is left as None.
    """
    # Always define the attribute so later access does not raise
    # AttributeError when nothing was loaded
    self.data = None
    self.filespec = filespec
    # pfile = pq.ParquetFile(filespec)
    # self.data = pfile.read()
    if filespec:
        try:
            self.data = pd.read_parquet(filespec)
            log.debug(f"Read {len(self.data)} entries from {filespec}")
        except Exception as e:
            # Catch Exception rather than everything so KeyboardInterrupt
            # and SystemExit still propagate, and keep the cause in the log
            log.error(f"Couldn't read data from {filespec}: {e}")

options: show_source: false heading_level: 3

API Docs for osm-rawdata¶

config.py¶

parseYaml ¶

load_yaml staticmethod ¶

parseJson ¶

getKeys ¶

dump ¶

postgres.py¶

createJson ¶

createSQL ¶

createTable ¶

execute ¶

queryLocal ¶

queryRemote ¶

createDB ¶

execQuery ¶

parseYaml ¶

load_yaml staticmethod ¶

parseJson ¶

getKeys ¶

dump ¶

importer.py¶

importOSM ¶

importParquet ¶

importGeoJson ¶

overture.py¶

load_yaml `staticmethod` ¶

load_yaml `staticmethod` ¶