API Docs for conflator

ConflateBuildings

ConflateBuildings(dburi=None, boundary=None)

Bases: object

Parameters:

    dburi (str): The DB URI. Default: None
    boundary (Polygon): The AOI of the project. Default: None

Returns:

    (ConflateBuildings): An instance of this object

Source code in conflator/conflateBuildings.py
def __init__(self,
             dburi: str = None,
             boundary: Polygon = None,
             ):
    """
    This class conflates data that has been imported into a postgres
    database using the Underpass raw data schema.

    Args:
        dburi (str): The DB URI
        boundary (Polygon): The AOI of the project

    Returns:
        (ConflateBuildings): An instance of this object
    """
    self.postgres = list()
    self.uri = None
    if dburi:
        self.uri = uriParser(dburi)
        self.db = GeoSupport(dburi)
    self.boundary = boundary
    self.view = "ways_poly"
    self.filter = list()
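
A minimal construction sketch (the database URI, import path, and AOI below are hypothetical; any database using the Underpass raw data schema and any shapely Polygon AOI will do):

from shapely.geometry import Polygon
from conflator.conflateBuildings import ConflateBuildings

# Hypothetical AOI and database URI; substitute your own.
aoi = Polygon([(0, 0), (0, 1), (1, 1), (1, 0)])
cdb = ConflateBuildings("localhost/buildings", aoi)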

addSourceFilter

addSourceFilter(source)

Add a source to the list of suspect (bad) source datasets

Source code in conflator/conflateBuildings.py
def addSourceFilter(self,
                    source: str,
                    ):
    """
    Add to a list of suspect bad source datasets

    """
    self.filter.append(source)
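
For example, to flag features that came from a suspect dataset (the source name here is hypothetical):

cdb.addSourceFilter("suspect_imagery_provider")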

overlapDB

overlapDB(dburi)

Conflate buildings where all the data is in the same postgres database using the Underpass raw data schema.

Parameters:

    dburi (str): The URI for the existing OSM data. Required.

This is not fast for large areas!

Source code in conflator/conflateBuildings.py
def overlapDB(self,
            dburi: str,
            ):
    """
    Conflate buildings where all the data is in the same postgres database
    using the Underpass raw data schema.

    Args:
        dburi (str): The URI for the existing OSM data

    This is not fast for large areas!
    """
    timer = Timer(text="conflateData() took {seconds:.0f}s")
    timer.start()
    # Find duplicate buildings in the same database
    #sql = f"DROP VIEW IF EXISTS overlap_view;CREATE VIEW overlap_view AS SELECT ST_Area(ST_INTERSECTION(g1.geom::geography, g2.geom::geography)) AS area,g1.osm_id AS id1,g1.geom as geom1,g2.osm_id AS id2,g2.geom as geom2 FROM {self.view} AS g1, {self.view} AS g2 WHERE ST_OVERLAPS(g1.geom, g2.geom) AND (g1.tags->>'building' IS NOT NULL AND g2.tags->>'building' IS NOT NULL)"
    #sql = "SELECT * FROM (SELECT ways_view.id, tags, ROW_NUMBER() OVER(PARTITION BY geom ORDER BY ways_view.geom asc) AS Row, geom FROM ONLY ways_view) dups WHERE dups.Row > 1"
    # Make a new postgres VIEW of all overlapping or touching buildings
    #log.info(f"Looking for overlapping buildings in \"{self.uri['dbname']}\", this may take a while...")
    #print(sql)
    # Views must be dropped in the right order
    sql = f"DROP TABLE IF EXISTS dups_view CASCADE; DROP TABLE IF EXISTS osm_view CASCADE;DROP TABLE IF EXISTS ways_view CASCADE;"
    result = self.db.queryDB(sql)

    if self.boundary:
        self.db.clipDB(self.boundary)

    log.debug(f"Clipping OSM database")
    ewkt = shape(self.boundary)
    uri = uriParser(dburi)
    log.debug(f"Extracting OSM subset from \"{uri['dbname']}\"")
    sql = f"CREATE TABLE osm_view AS SELECT osm_id,tags,geom FROM dblink('dbname={uri['dbname']}', 'SELECT osm_id,tags,geom FROM ways_poly') AS t1(osm_id int, tags jsonb, geom geometry) WHERE ST_CONTAINS(ST_GeomFromEWKT('SRID=4326;{ewkt}'), geom) AND tags->>'building' IS NOT NULL"
    # print(sql)
    result = self.db.queryDB(sql)

    sql = f"CREATE TABLE dups_view AS SELECT ST_Area(ST_INTERSECTION(g1.geom::geography, g2.geom::geography)) AS area,g1.osm_id AS id1,g1.geom as geom1,g1.tags AS tags1,g2.osm_id AS id2,g2.geom as geom2, g2.tags AS tags2 FROM ways_view AS g1, osm_view AS g2 WHERE ST_INTERSECTS(g1.geom, g2.geom) AND g2.tags->>'building' IS NOT NULL"
    print(sql)
    result = self.db.queryDB(sql)
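
Continuing the sketch above, a full building conflation pass is two calls (the OSM database URI is hypothetical):

cdb.overlapDB("localhost/osm")
duplicates = cdb.getDuplicates()  # FeatureCollection of likely duplicates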

cleanDuplicates

cleanDuplicates()

Delete the entries from the duplicate building view.

Returns:

    (bool): Always True

Source code in conflator/conflateBuildings.py
def cleanDuplicates(self):
    """
    Delete the entries from the duplicate building view.

    Returns:
        (bool): Always True
    """
    log.debug(f"Removing duplicate buildings from ways_view")
    sql = f"DELETE FROM ways_view WHERE osm_id IN (SELECT id1 FROM dups_view)"

    result = self.db.queryDB(sql)
    return True

getNew

getNew()

Get only the new buildings

Returns:

    (FeatureCollection): The entries from the database table

Source code in conflator/conflateBuildings.py
def getNew(self):
    """
    Get only the new buildings

    Returns:
        (FeatureCollection): The entries from the database table
    """
    sql = f"SELECT osm_id,geom,tags FROM ways_view"
    result = self.db.queryDB(sql)
    features = list()
    for item in result:
        # log.debug(item)
        entry = {'osm_id': item[0]}
        entry.update(item[2])
        geom = wkb.loads(item[1])
        features.append(Feature(geometry=geom, properties=entry))

    log.debug(f"{len(features)} new features found")
    return FeatureCollection(features)
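
The returned FeatureCollection can be written straight to disk with geojson, much as main() does below (the output filename is arbitrary):

import geojson

new = cdb.getNew()
with open("new-buildings.geojson", "w") as outfile:
    geojson.dump(new, outfile)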

findHighway

findHighway(feature)

Find the nearest highway to a feature

Parameters:

    feature (Feature): The feature to check against. Required.
Source code in conflator/conflateBuildings.py
def findHighway(self,
                feature: Feature,
                ):
    """
    Find the nearest highway to a feature

    Args:
        feature (Feature): The feature to check against
    """
    pass

getDuplicates

getDuplicates()

Get the entries from the duplicate building view.

Returns:

    (FeatureCollection): The entries from the database table

Source code in conflator/conflateBuildings.py
def getDuplicates(self):
    """
    Get the entries from the duplicate building view.

    Returns:
        (FeatureCollection): The entries from the database table
    """
    sql = f"SELECT area,id1,geom1,tags1,id2,geom2,tags2 FROM dups_view"
    result = self.db.queryDB(sql)
    features = list()
    for item in result:
        #log.debug(item)
        # First building identified
        entry = {'area': float(item[0]), 'id': int(item[1])}
        geom = wkb.loads(item[2])
        entry.update(item[3])
        features.append(Feature(geometry=geom, properties=entry))

        # Second building identified
        entry = {'area': float(item[0]), 'id': int(item[4])}
        geom = wkb.loads(item[5])
        entry.update(item[6])
        # FIXME: Merge the tags from the buildings into the OSM feature
        # entry.update(item[3])
        features.append(Feature(geometry=geom, properties=entry))

    log.debug(f"{len(features)} duplicate features found")
    return FeatureCollection(features)

main

main()

This main function lets this class be run standalone by a bash script

Source code in conflator/conflateBuildings.py
def main():
    """This main function lets this class be run standalone by a bash script"""
    parser = argparse.ArgumentParser(
        prog="conflateDB",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="This program conflates external data with existing features from OSM.",
        epilog="""
    This program conflates external datasets with OSM data using a postgresql database.
        """,
    )
    parser.add_argument("-v", "--verbose", action="store_true", help="verbose output")
    parser.add_argument("-d", "--dburi", required=True, help="Source Database URI")
    parser.add_argument("-o", "--osmuri", required=True, help="OSM Database URI")
    parser.add_argument("-b", "--boundary", required=True,
                        help="Boundary polygon to limit the data size")
    # parser.add_argument("-o", "--outfile", help="Post conflation output file")

    args = parser.parse_args()

    # if verbose, dump to the terminal.
    if args.verbose:
        log.setLevel(logging.DEBUG)
        ch = logging.StreamHandler(sys.stdout)
        ch.setLevel(logging.DEBUG)
        formatter = logging.Formatter(
            "%(threadName)10s - %(name)s - %(levelname)s - %(message)s"
        )
        ch.setFormatter(formatter)
        log.addHandler(ch)

    file = open(args.boundary, 'r')
    boundary = geojson.load(file)
    if 'features' in boundary:
        poly = boundary['features'][0]['geometry']
    else:
        poly = boundary['geometry']
    cdb = ConflateBuildings(args.dburi, poly)
    cdb.overlapDB(args.osmuri)
    features = cdb.getDuplicates()

    # FIXME: These are only for debugging
    file = open("foo.geojson", 'w')
    geojson.dump(features, file)
    log.info(f"Wrote foo.geojson for duplicates")

    cdb.cleanDuplicates()
    features = cdb.getNew()
    file = open("bar.geojson", 'w')
    geojson.dump(features, file)

    log.info(f"Wrote bar.geojson for new buildings")

ConflatePOI

ConflatePOI(dburi=None, boundary=None, threshold=7)

Bases: object

Parameters:

    dburi (str): The DB URI. Default: None
    boundary (Polygon): The AOI of the project. Default: None
    threshold (int): The distance in meters for distance calculations. Default: 7

Returns:

    (ConflatePOI): An instance of this object

Source code in conflator/conflatePOI.py
def __init__(self,
             dburi: str = None,
             boundary: Polygon = None,
             threshold: int = 7,
             ):
    """
    This class conflates data that has been imported into a postgres
    database using the Underpass raw data schema.

    Args:
        dburi (str): The DB URI
        boundary (Polygon): The AOI of the project
        threshold (int): The distance in meters for distance calculations

    Returns:
        (ConflatePOI): An instance of this object
    """
    self.data = dict()
    self.db = None
    self.tolerance = threshold # Distance in meters for conflating with postgis
    self.boundary = boundary
    # Use a common select so it's consistent when parsing results
    self.select = "SELECT osm_id,tags,version,ST_AsText(geom),ST_Distance(geom::geography, ST_GeogFromText(\'SRID=4326;%s\'))"
    if dburi:
        # for thread in range(0, cores + 1):
        self.db = GeoSupport(dburi)
        # self.db.append(db)
        # We only need to clip the database into a new table once
        if boundary:
            self.db.clipDB(boundary, self.db.db)
            self.db.clipDB(boundary, self.db.db, "nodes_view", "nodes")
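
A minimal construction sketch (the URI and AOI are hypothetical; threshold is the conflation distance in meters):

from shapely.geometry import Polygon
from conflator.conflatePOI import ConflatePOI

# Hypothetical AOI and database URI; substitute your own.
aoi = Polygon([(0, 0), (0, 1), (1, 1), (1, 0)])
cp = ConflatePOI("localhost/osm", aoi, threshold=10)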

overlaps

overlaps(feature)

Conflate a POI against all the features in a GeoJson file

Parameters:

    feature (dict): The feature to conflate. Required.

Returns:

    (dict): The modified feature

Source code in conflator/conflatePOI.py
def overlaps(self,
            feature: dict,
            ):
    """
    Conflate a POI against all the features in a GeoJson file

    Args:
        feature (dict): The feature to conflate

    Returns:
        (dict):  The modified feature
    """
    # Most smartphone GPS are 5-10m off most of the time, plus sometimes
    # we're standing in front of an amenity and recording that location
    # instead of in the building.
    gps_accuracy = 10
    # this is the threshold for fuzzy string matching
    match_threshold = 80
    # log.debug(f"conflateFile({feature})")
    hits = False
    data = dict()
    geom = Point((float(feature["attrs"]["lon"]), float(feature["attrs"]["lat"])))
    wkt = shape(geom)
    for existing in self.data['features']:
        id = int(existing['properties']['id'])
        entry = shapely.from_geojson(str(existing))
        if entry.geom_type != 'Point':
            center = shapely.centroid(entry)
        else:
            center = entry
            # dist = shapely.hausdorff_distance(center, wkt)
            # if 'name' in existing['properties']:
            #     print(f"DIST1: {dist}, {existing['properties']['name']}")
        # x = shapely.distance(wkt, entry)
        # haversine reverses the order of lat & lon from what shapely uses. We
        # use this as meters is easier to deal with than cartesian coordinates.
        x1 = (center.coords[0][1], center.coords[0][0])
        x2 = (wkt.coords[0][1], wkt.coords[0][0])
        dist = haversine(x1, x2, unit=Unit.METERS)
        if dist < gps_accuracy:
            # if 'name' in existing['properties']:
            # log.debug(f"DIST2: {dist}")
            # log.debug(f"Got a Hit! {feature['tags']['name']}")
            for key,value in feature['tags'].items():
                if key in self.analyze:
                    if key in existing['properties']:
                        result = fuzz.ratio(value, existing['properties'][key])
                        if result > match_threshold:
                            # log.debug(f"Matched: {result}: {feature['tags']['name']}")
                            existing['properties']['fixme'] = "Probably a duplicate!"
                            log.debug(f"Got a dup in file!!! {existing['properties']['name'] }")
                            hits = True
                            break
        if hits:
            version = int(existing['properties']['version'])
            # coords = feature['geometry']['coordinates']
            # lat = coords[1]
            # lon = coords[0]
            attrs = {'id': id, 'version': version, 'lat': feature['attrs']['lat'], 'lon': feature['attrs']['lon']}
            tags = existing['properties']
            tags['fixme'] = "Probably a duplicate!"
            # Data extracts for ODK Collect
            del tags['title']
            del tags['label']
            if 'building' in tags:
                return {'attrs': attrs, 'tags': tags, 'refs': list()}
            return {'attrs': attrs, 'tags': tags}
    return dict()
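
The feature argument uses the parsed OSM XML layout seen throughout this module: an 'attrs' dict with at least lat and lon, plus a 'tags' dict. A hypothetical input, assuming cp.data has already been loaded with a GeoJson FeatureCollection:

# All values are made up for illustration.
feature = {
    'attrs': {'id': -1, 'lat': '0.5', 'lon': '0.5'},
    'tags': {'name': 'Kwik E Mart', 'shop': 'convenience'},
}
# Note: overlaps() compares only the tags listed in cp.analyze.
match = cp.overlaps(feature)  # an empty dict means no duplicate was found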

queryToFeature

queryToFeature(results)

Convert the results of an SQL query into GeoJson Features

Parameters:

    results (list): The results of the query. Required.

Returns:

    (list): A list of features built from the results

Source code in conflator/conflatePOI.py
def queryToFeature(self,
                   results: list,
                   ):
    """
    Convert the results of an SQL query into GeoJson Features

    Args:
        results (list): The results of the query

    Returns:
        (list): A list of features built from the results
    """

    features = list()
    for entry in results:
        osm_id = int(entry[0])
        tags = entry[1]
        version = int(entry[2])
        coords = shapely.from_wkt(entry[3])
        dist = entry[4]
        # ways have an additional column
        if len(entry) == 6:
            refs = entry[5]
        else:
            refs = list()
        if coords.geom_type == 'Polygon':
            center = shapely.centroid(coords)
            lat = center.y
            lon = center.x
            tags['geom_type'] = 'way'
        elif coords.geom_type == "Point":
            lat = coords.y
            lon = coords.x
            tags['geom_type'] = 'node'
        else:
            log.error(f"Unsupported geometry type: {coords.geom_type}")
        # match = entry[5] # FIXME: for debugging
        # the timestamp attribute gets added when it's uploaded to OSM.
        attrs = {'id': osm_id,
                'version': version,
                'lat': lat,
                'lon': lon,
                }
        tags['dist'] = dist
        # tags['match'] = match # FIXME: for debugging
        # tags['fixme'] = "Probably a duplicate node!"
        features.append({'attrs': attrs, 'tags': tags, 'refs': refs})

    return features

checkTags

checkTags(feature, osm)

Check tags between 2 features.

Parameters:

    feature (Feature): The feature from the external dataset. Required.
    osm (dict): The result of the SQL query. Required.

Returns:

    (int): The number of tag matches
    (dict): The updated tags

Source code in conflator/conflatePOI.py
def checkTags(self,
              feature: Feature,
              osm: dict,
              ):
    """
    Check tags between 2 features.

    Args:
        feature (Feature): The feature from the external dataset
        osm (dict): The result of the SQL query

    Returns:
        (int): The number of tag matches
        (dict): The updated tags
    """
    tags = osm['tags']
    hits = 0
    match_threshold = 80
    if osm['tags']['dist'] > float(self.tolerance):
        return 0, osm['tags']
    for key, value in feature['tags'].items():
        if key in tags:
            ratio = fuzz.ratio(value, tags[key])
            if ratio > match_threshold:
                hits += 1
            else:
                if key != 'note':
                    tags[f'old_{key}'] = value
        tags[key] = value

    return hits, tags
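
checkTags() expects one entry in the shape produced by queryToFeature(), with the computed 'dist' already present in its tags. A hypothetical call, reusing the feature from the overlaps() example above:

# One result as produced by queryToFeature(); all values are made up.
osm_entry = {
    'attrs': {'id': 1234, 'version': 2, 'lat': 0.5, 'lon': 0.5},
    'tags': {'name': 'Kwik E Mart', 'dist': 3.2},
}
hits, tags = cp.checkTags(feature, osm_entry)
# hits counts tags whose fuzzy match ratio beat 80; tags is the merged set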

conflateData

conflateData(data, threshold=7)

Conflate all the data. This is the primary interface for conflation.

Parameters:

    data (list): A list of all the entries in the OSM XML input file. Required.
    threshold (int): The threshold for distance calculations. Default: 7

Returns:

    (dict): The modified features

Source code in conflator/conflatePOI.py
def conflateData(self,
                 data: list,
                 threshold: int = 7,
                 ):
    """
    Conflate all the data. This is the primary interface for conflation.

    Args:
        data (list): A list of all the entries in the OSM XML input file
        threshold (int): The threshold for distance calculations

    Returns:
        (dict):  The modified features
    """
    timer = Timer(text="conflateData() took {seconds:.0f}s")
    timer.start()
    # Use fuzzy string matching to handle minor issues in the name column,
    # which is often used to match an amenity.
    if len(self.data) == 0:
        self.db.queryDB("CREATE EXTENSION IF NOT EXISTS fuzzystrmatch")
    log.debug(f"conflateData() called! {len(data)} features")

    # A chunk is a group of threads
    entries = len(data)
    chunk = round(len(data) / cores)

    if True: # FIXME: entries <= chunk:
        result = conflateThread(data, self)
        timer.stop()
        return result

    # Chop the data into a subset for each thread
    newdata = list()
    future = None
    result = None
    index = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:
        i = 0
        subset = dict()
        futures = list()
        for key, value in data.items():
            subset[key] = value
            if i == chunk:
                i = 0
                result = executor.submit(conflateThread, subset, self)
                index += 1
                # result.add_done_callback(callback)
                futures.append(result)
                subset = dict()
            i += 1
        for future in concurrent.futures.as_completed(futures):
            log.debug(f"Waiting for thread to complete..")
            # print(f"YYEESS!! {future.result(timeout=10)}")
            newdata.append(future.result(timeout=5))
    timer.stop()
    return newdata
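
A minimal driver sketch following the flow of main() below (file names are hypothetical; OsmFile is the OSM XML helper that main() relies on):

odkf = OsmFile("conflated.osm")
osm = odkf.loadFile("collected.osm")
merged = cp.conflateData(osm)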

queryWays

queryWays(feature, db=None)

Conflate a POI against all the ways in a postgres view

Parameters:

    feature (Feature): The feature to conflate. Required.
    db (GeoSupport): The database connection to use. Default: None

Returns:

    (list): The data with tags added from the conflation

Source code in conflator/conflatePOI.py
def queryWays(self,
              feature: Feature,
              db: GeoSupport = None,
              ):
    """
    Conflate a POI against all the ways in a postgres view

    Args:
        feature (Feature): The feature to conflate
        db (GeoSupport): The database connection to use

    Returns:
        (list): The data with tags added from the conflation
    """
    # log.debug(f"conflateWay({feature})")
    hits = 0
    result = list()
    geom = Point((float(feature["attrs"]["lon"]), float(feature["attrs"]["lat"])))
    wkt = shape(geom)

    # cleanval = escape(value)
    # Get all ways close to this feature.
    # query = f"SELECT osm_id,tags,version,ST_AsText(ST_Centroid(geom)),ST_Distance(geom::geography, ST_GeogFromText(\'SRID=4326;{wkt.wkt}\')) FROM ways_view WHERE ST_Distance(geom::geography, ST_GeogFromText(\'SRID=4326;{wkt.wkt}\')) < {self.tolerance} ORDER BY ST_Distance(geom::geography, ST_GeogFromText(\'SRID=4326;{wkt.wkt}\'))"
    query = f"{self.select}" % wkt.wkt
    query += f", refs FROM ways_view WHERE ST_Distance(geom::geography, ST_GeogFromText(\'SRID=4326;{wkt.wkt}\')) < {self.tolerance} ORDER BY ST_Distance(geom::geography, ST_GeogFromText(\'SRID=4326;{wkt.wkt}\'))"
    #log.debug(query)
    result = list()
    if db:
        result = db.queryDB(query)
    else:
        result = self.db.queryDB(query)
    if len(result) > 0:
        hits += 1
    else:
        log.warning(f"No results at all for {query}")

    return result

queryNodes

queryNodes(feature, db=None)

Find all the nodes in the view within a certain distance that are buildings or amenities.

Parameters:

    feature (Feature): The feature to use as the location. Required.
    db (GeoSupport): The database connection to use. Default: None

Returns:

    (list): The results of the conflation

Source code in conflator/conflatePOI.py
def queryNodes(self,
                 feature: Feature,
                 db: GeoSupport = None,
                 ):
    """
    Find all the nodes in the view within a certain distance that
    are buildings or amenities.

    Args:
        feature (Feature): The feature to use as the location
        db (GeoSupport): The database connection to use

    Returns:
        (list): The results of the conflation
    """
    # log.debug(f"queryNodes({feature})")
    hits = 0
    geom = Point((float(feature["attrs"]["lon"]), float(feature["attrs"]["lat"])))
    wkt = shape(geom)
    result = list()
    ratio = 1

    # for key,value in feature['tags'].items():
    # print(f"NODE: {key} = {value}")
    # if key not in self.analyze:
    #     continue

    # Use a Geography data type to get the answer in meters, which
    # is easier to deal with than degrees of the earth.
    # cleanval = escape(value)
    # query = f"SELECT osm_id,tags,version,ST_AsEWKT(geom),ST_Distance(geom::geography, ST_GeogFromText(\'SRID=4326;{wkt.wkt}\')),levenshtein(tags->>'{key}', '{cleanval}') FROM nodes_view WHERE ST_Distance(geom::geography, ST_GeogFromText(\'SRID=4326;{wkt.wkt}\')) < {self.tolerance} AND levenshtein(tags->>'{key}', '{cleanval}') <= {ratio}"
    # AND (tags->>'amenity' IS NOT NULL OR tags->>'shop' IS NOT NULL)"
    query = f"{self.select}" % wkt.wkt
    query += f" FROM nodes_view WHERE ST_Distance(geom::geography, ST_GeogFromText(\'SRID=4326;{wkt.wkt}\')) < {self.tolerance} AND (tags->>'amenity' IS NOT NULL OR tags->>'building' IS NOT NULL)"
    #log.debug(query)
    # FIXME: this currently only works with a local database,
    # not underpass yet
    if db:
        result = db.queryDB(query)
    else:
        result = self.db.queryDB(query)
    # log.debug(f"Got {len(result)} results")
    if len(result) > 0:
        hits += 1
        # break
    # else:
    #     log.warning(f"No results at all for {query}")

    return result
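
queryNodes() and queryToFeature() chain naturally, as conflateThread() below does: the raw rows come back from postgres, then get converted into the attrs/tags feature layout (feature here is the hypothetical POI from the overlaps() example above):

rows = cp.queryNodes(feature)
candidates = cp.queryToFeature(rows)
for entry in candidates:
    hits, tags = cp.checkTags(feature, entry)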

conflateThread

conflateThread(features, cp)

Conflate a subset of the data

Parameters:

    features (list): The features to conflate. Required.
    cp (ConflatePOI): The top-level class. Required.

Returns:

    (list): The results of the conflation

Source code in conflator/conflatePOI.py
def conflateThread(features: list,
                   cp: ConflatePOI,
                   ):
    """
    Conflate a subset of the data

    Args:
        features (list): The features to conflate
        cp (ConflatePOI): The top level class

    Returns:
        (list): The results of the conflation
    """
    timer = Timer(text="conflateThread() took {seconds:.0f}s")
    timer.start()
    log.debug(f"conflateThread() called! {len(features)} features")
    result = dict()
    dups = 0
    # This is brute force, slow but accurate. Process each feature
    # and look for a possible match with existing data.
    merged = list()
    for key, value in features.items():
        id = int(value['attrs']['id'])
        geom = Point((float(value["attrs"]["lon"]), float(value["attrs"]["lat"])))
        if not shapely.contains(shape(cp.boundary), geom):
            # log.debug(f"Point is not in boundary!")
            continue
        # Each of the conflation methods take a single feature
        # as a parameter, and returns a possible match or a zero
        # length dictionary.
        results = list()
        if id > 0:
            # Any feature ID greater than zero is existing data.
            # Any feature ID less than zero is new data collected
            # using geopoint in the XLSForm.
            results = cp.queryById(value)
        elif id < 0:
            results = cp.queryNodes(value)
            if len(results) == 0:
                # log.warning(f"No results in nodes at all for {value}")
                results = cp.queryWays(value)
                if len(results) == 0:
                    log.warning(f"No results in ways at all for {value}")
                    # value['fixme'] = "Probably a new feature"
                    # merged.append(value)

        if len(results) == 0:
            merged.append(value)
            continue
            # log.error(f"There are no results!")

        # Merge the tags and attributes together from the OSM data and the
        # ODK data. If no match is found, the ODK data is used to create
        # a new feature.
        if len(results) > 1:
            log.warning(f"Got more than one result! Got {len(results)}")

            for entry in cp.queryToFeature(results):
                hits, tags = cp.checkTags(value, entry)
                log.debug(f"Got {hits} out of {len(tags)} matched for {tags}")
                if hits > 0:
                    dups += 1
                    tags['fixme'] = f"Probably a duplicate, got {hits} matches"
                    entry['tags'] = tags
                    entry['attrs']['version'] += 1
                    merged.append(entry)
                    break
                else:
                    merged.append(value)

    timer.stop()
    log.debug(f"Found {dups} duplicates")
    return merged

main

main()

This main function lets this class be run standalone by a bash script

Source code in conflator/conflatePOI.py
def main():
    """This main function lets this class be run standalone by a bash script"""
    parser = argparse.ArgumentParser(
        prog="conflator",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="This program conflates external data with existing features from OSM.",
        epilog="""
    This program conflates external datasets with OSM data using a postgresql database.
        """,
    )
    parser.add_argument("-v", "--verbose", action="store_true", help="verbose output")
    parser.add_argument("-o", "--osmuri", required=True, help="OSM Database URI")
    parser.add_argument("-t", "--threshold", default=7,
                        help="Threshold for distance calculations")
    parser.add_argument("-i", "--infile", required=True,
                        help="GeoJson or OSM XML file to conflate")
    parser.add_argument("-b", "--boundary",
                        help="Boundary polygon to limit the data size")

    args = parser.parse_args()
    # if verbose, dump to the terminal.
    if args.verbose:
        log.setLevel(logging.DEBUG)
        ch = logging.StreamHandler(sys.stdout)
        ch.setLevel(logging.DEBUG)
        formatter = logging.Formatter(
            "%(threadName)10s - %(name)s - %(levelname)s - %(message)s"
        )
        ch.setFormatter(formatter)
        log.addHandler(ch)

    outfile = os.path.basename(args.infile.replace('.osm', '-foo.osm'))

    # This is the existing OSM data, a database or a file
    file = open(args.boundary, 'r')
    boundary = geojson.load(file)
    if 'features' in boundary:
        poly = boundary['features'][0]['geometry']
    else:
        poly = boundary['geometry']
    extract = ConflatePOI(args.osmuri, poly, args.threshold)

    if extract:
        odkf = OsmFile(outfile)
        osm = odkf.loadFile(args.infile)
        #odkf.dump()
    else:
        log.error("No ODK data source specified!")
        parser.print_help()
        quit()

    # This returns a list of lists of dictionaries. Each thread returns
    # a list of the features, and len(data) is the number of CPU cores.
    data = extract.conflateData(osm)
    out = list()
    #print(data)
    for entry in data:
        if 'geom_type' not in entry['tags']:
            if 'lat' in entry['attrs']:
                entry['tags']['geom_type'] = 'node'
        if entry['tags']['geom_type'] == 'way':
            del entry['tags']['geom_type']
            out.append(odkf.createWay(entry, True))
        else:
            del entry['tags']['geom_type']
            out.append(odkf.createNode(entry, True))
        # this isn't needed anymore

    odkf.write(out)
    log.info(f"Wrote {outfile}")


