Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge least feature count polygons with neighbouring polygons #58

Merged
merged 2 commits into from
Oct 30, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
229 changes: 112 additions & 117 deletions fmtm_splitter/fmtm_algorithm.sql
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ DROP TABLE IF EXISTS polygonsnocount;
DO $$
DECLARE
lines_count INTEGER;
num_buildings INTEGER;
BEGIN
-- Check if ways_line has any data
SELECT COUNT(*) INTO lines_count
Expand Down Expand Up @@ -135,7 +134,6 @@ USING gist (geom);
-- Clean up the table which may have gaps and stuff from spatial indexing
-- VACUUM ANALYZE buildings;


DROP TABLE IF EXISTS splitpolygons;
CREATE TABLE splitpolygons AS (
WITH polygonsfeaturecount AS (
Expand Down Expand Up @@ -163,49 +161,49 @@ USING gist (geom);
DROP TABLE polygonsnocount;


DROP TABLE IF EXISTS lowfeaturecountpolygons;
CREATE TABLE lowfeaturecountpolygons AS (
-- Grab the polygons with fewer than the requisite number of features
WITH lowfeaturecountpolys AS (
SELECT *
FROM splitpolygons AS p
-- TODO: feature count should not be hard-coded
WHERE p.numfeatures < %(num_buildings)s
),

-- Find the neighbors of the low-feature-count polygons
-- Store their ids as n_polyid, numfeatures as n_numfeatures, etc
allneighborlist AS (
SELECT
p.*,
pf.polyid AS n_polyid,
pf.area AS n_area,
p.numfeatures AS n_numfeatures,
-- length of shared boundary to make nice merge decisions
ST_LENGTH2D(ST_INTERSECTION(p.geom, pf.geom)) AS sharedbound
FROM lowfeaturecountpolys AS p
INNER JOIN splitpolygons AS pf
-- Anything that touches
ON ST_TOUCHES(p.geom, pf.geom)
-- But eliminate those whose intersection is a point, because
-- polygons that only touch at a corner shouldn't be merged
AND ST_GEOMETRYTYPE(ST_INTERSECTION(p.geom, pf.geom)) != 'ST_Point'
-- Sort first by polyid of the low-feature-count polygons
-- Then by descending featurecount and area of the
-- high-feature-count neighbors (area is in case of equal
-- featurecounts, we'll just pick the biggest to add to)
ORDER BY p.polyid ASC, p.numfeatures DESC, pf.area DESC
-- OR, maybe for more aesthetic merges:
-- order by p.polyid, sharedbound desc
)

SELECT DISTINCT ON (a.polyid) * FROM allneighborlist AS a
);
ALTER TABLE lowfeaturecountpolygons ADD PRIMARY KEY (polyid);
SELECT POPULATE_GEOMETRY_COLUMNS('public.lowfeaturecountpolygons'::regclass);
CREATE INDEX lowfeaturecountpolygons_idx
ON lowfeaturecountpolygons
USING gist (geom);
-- DROP TABLE IF EXISTS lowfeaturecountpolygons;
-- CREATE TABLE lowfeaturecountpolygons AS (
-- -- Grab the polygons with fewer than the requisite number of features
-- WITH lowfeaturecountpolys AS (
-- SELECT *
-- FROM splitpolygons AS p
-- -- TODO: feature count should not be hard-coded
-- WHERE p.numfeatures < %(num_buildings)s
-- ),

-- -- Find the neighbors of the low-feature-count polygons
-- -- Store their ids as n_polyid, numfeatures as n_numfeatures, etc
-- allneighborlist AS (
-- SELECT
-- p.*,
-- pf.polyid AS n_polyid,
-- pf.area AS n_area,
-- p.numfeatures AS n_numfeatures,
-- -- length of shared boundary to make nice merge decisions
-- ST_LENGTH2D(ST_INTERSECTION(p.geom, pf.geom)) AS sharedbound
-- FROM lowfeaturecountpolys AS p
-- INNER JOIN splitpolygons AS pf
-- -- Anything that touches
-- ON ST_TOUCHES(p.geom, pf.geom)
-- -- But eliminate those whose intersection is a point, because
-- -- polygons that only touch at a corner shouldn't be merged
-- AND ST_GEOMETRYTYPE(ST_INTERSECTION(p.geom, pf.geom)) != 'ST_Point'
-- -- Sort first by polyid of the low-feature-count polygons
-- -- Then by descending featurecount and area of the
-- -- high-feature-count neighbors (area is in case of equal
-- -- featurecounts, we'll just pick the biggest to add to)
-- ORDER BY p.polyid ASC, p.numfeatures DESC, pf.area DESC
-- -- OR, maybe for more aesthetic merges:
-- -- order by p.polyid, sharedbound desc
-- )

-- SELECT DISTINCT ON (a.polyid) * FROM allneighborlist AS a
-- );
-- ALTER TABLE lowfeaturecountpolygons ADD PRIMARY KEY (polyid);
-- SELECT POPULATE_GEOMETRY_COLUMNS('public.lowfeaturecountpolygons'::regclass);
-- CREATE INDEX lowfeaturecountpolygons_idx
-- ON lowfeaturecountpolygons
-- USING gist (geom);
-- VACUUM ANALYZE lowfeaturecountpolygons;


Expand Down Expand Up @@ -245,7 +243,7 @@ CREATE TABLE clusteredbuildings AS (
*,
ST_CLUSTERKMEANS(
b.geom,
CAST((b.numfeatures / %(num_buildings)s) + 1 AS integer)
CAST((b.numfeatures / b.%(num_buildings)s) + 1 AS integer)
)
OVER (PARTITION BY b.polyid)
AS cid
Expand Down Expand Up @@ -335,7 +333,6 @@ USING gist (geom);

--VACUUM ANALYZE unsimplifiedtaskpolygons;


--*****************************Simplify*******************************
-- Extract unique line segments
DROP TABLE IF EXISTS taskpolygons;
Expand Down Expand Up @@ -380,78 +377,70 @@ CREATE TABLE taskpolygons AS (
FROM taskpolygonsnoindex AS tpni
);

-- ALTER TABLE taskpolygons ADD PRIMARY KEY(taskid);

-- Step 1: Identify polygons without any buildings
DROP TABLE IF EXISTS no_building_polygons;
CREATE TABLE no_building_polygons AS (
SELECT
tp.*,
tp.geom AS no_building_geom
FROM taskpolygons AS tp
LEFT JOIN buildings AS b ON ST_INTERSECTS(tp.geom, b.geom)
WHERE b.geom IS NULL
);

-- Step 2: Identify neighboring polygons
DROP TABLE IF EXISTS neighboring_polygons;
CREATE TABLE neighboring_polygons AS (
SELECT
nb.*,
nb.geom AS neighbor_geom,
nb_building_count,
nbp.no_building_geom
FROM taskpolygons AS nb
INNER JOIN no_building_polygons AS nbp
-- Finds polygons that touch each other
ON ST_TOUCHES(nbp.no_building_geom, nb.geom)
LEFT JOIN (
-- Step 3: Count buildings in the neighboring polygons
SELECT
nb.geom,
COUNT(b.geom) AS nb_building_count
FROM taskpolygons AS nb
LEFT JOIN buildings AS b ON ST_INTERSECTS(nb.geom, b.geom)
GROUP BY nb.geom
) AS building_counts
ON nb.geom = building_counts.geom
);
ALTER TABLE taskpolygons ADD PRIMARY KEY (taskid);
SELECT POPULATE_GEOMETRY_COLUMNS('public.taskpolygons'::regclass);
CREATE INDEX taskpolygons_idx
ON taskpolygons
USING gist (geom);

-- Step 4: Find the optimal neighboring polygon to avoid,
-- same polygons with the smallest number of buildings merging into
-- multiple neighboring polygons
DROP TABLE IF EXISTS optimal_neighbors;
CREATE TABLE optimal_neighbors AS (
SELECT
nbp.no_building_geom,
nbp.neighbor_geom
FROM neighboring_polygons AS nbp
WHERE nbp.nb_building_count = (
SELECT MIN(nb_building_count)
FROM neighboring_polygons AS np
WHERE np.no_building_geom = nbp.no_building_geom
)
);
-- Merge least feature polygons with neighbouring polygons
DO $$
DECLARE
num_buildings INTEGER := %(num_buildings)s;
min_area NUMERIC; -- Set the minimum area threshold
mean_area NUMERIC;
stddev_area NUMERIC; -- Set the standard deviation
min_buildings INTEGER; -- Set the minimum number of buildings threshold
small_polygon RECORD; -- set small_polygon and nearest_neighbor as record
nearest_neighbor RECORD; -- in order to use them in the loop
BEGIN
min_buildings := num_buildings * 0.5;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you tried modifying this number a bit to see the result?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

0.5 is ok, but as a user I would imagine if they requested 15 buildings per task, but only got 8 they might be a bit disappointed 😅

Does 0.7 change things by a lot?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This discrepancy will obviously scale linearly: say they request 50 buildings average and get 25 minimum.

But personally, I think if users are requesting 50 geoms per area there is something wrong with their methodology! I would imagine 10 buildings per task is more realistic for an individual or team 👍

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh crap i didn't see these comments , yeah i had tried with params such as 0.7 but got so big tasks in result. Yeah we can change this in future based on the practical use case scenarios


-- Step 5: Merge the small polygons with their optimal neighboring polygons
UPDATE taskpolygons tp
SET geom = ST_UNION(tp.geom, nbp.no_building_geom)
FROM optimal_neighbors AS nbp
WHERE tp.geom = nbp.neighbor_geom;
DELETE FROM taskpolygons
WHERE geom IN (
SELECT no_building_geom
FROM no_building_polygons
);
-- Find the mean and standard deviation of the area
SELECT
AVG(ST_Area(geom)),
STDDEV_POP(ST_Area(geom))
INTO mean_area, stddev_area
FROM taskpolygons;

-- Set the threshold as mean - standard deviation
min_area := mean_area - stddev_area;

DROP TABLE IF EXISTS no_building_polygons;
DROP TABLE IF EXISTS neighboring_polygons;
DROP TABLE IF EXISTS leastfeaturepolygons;
CREATE TABLE leastfeaturepolygons AS
SELECT taskid, geom
FROM taskpolygons
WHERE ST_Area(geom) < min_area OR (
SELECT COUNT(b.id) FROM buildings b
WHERE ST_Contains(taskpolygons.geom, b.geom)
) < min_buildings; -- find least feature polygons based on the area and feature

FOR small_polygon IN
SELECT * FROM leastfeaturepolygons
LOOP
-- Find the nearest neighbor to merge the small polygon with
FOR nearest_neighbor IN
SELECT taskid, geom, ST_LENGTH2D(ST_Intersection(small_polygon.geom, geom)) as shared_bound
FROM taskpolygons
WHERE taskid NOT IN (SELECT taskid FROM leastfeaturepolygons)
AND ST_Touches(small_polygon.geom, geom)
AND ST_GEOMETRYTYPE(ST_INTERSECTION(small_polygon.geom, geom)) != 'ST_Point'
ORDER BY shared_bound DESC -- Find neighbor polygon based on shared boundary distance
LIMIT 1
LOOP
-- Merge the small polygon into the neighboring polygon
UPDATE taskpolygons
SET geom = ST_Union(geom, small_polygon.geom)
WHERE taskid = nearest_neighbor.taskid;

DELETE FROM taskpolygons WHERE taskid = small_polygon.taskid;
-- Exit the neighboring polygon loop after one successful merge
EXIT;
END LOOP;
END LOOP;
END $$;

SELECT POPULATE_GEOMETRY_COLUMNS('public.taskpolygons'::regclass);
CREATE INDEX taskpolygons_idx
ON taskpolygons
USING gist (geom);
DROP TABLE IF EXISTS leastfeaturepolygons;
-- VACUUM ANALYZE taskpolygons;

-- Generate GeoJSON output
Expand All @@ -464,8 +453,14 @@ FROM (
SELECT
JSONB_BUILD_OBJECT(
'type', 'Feature',
'geometry', ST_ASGEOJSON(geom)::jsonb,
'properties', JSONB_BUILD_OBJECT()
'geometry', ST_ASGEOJSON(t.geom)::jsonb,
'properties', JSONB_BUILD_OBJECT(
'building_count', (
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💫

SELECT COUNT(b.id)
FROM buildings AS b
WHERE ST_CONTAINS(t.geom, b.geom)
)
)
) AS feature
FROM taskpolygons
FROM taskpolygons AS t
) AS features;
Loading