## Setup steps (for Windows)

### Install software
- Install PostgreSQL 13.x or higher on your local machine
- After installation, use the PostgreSQL Application Stack Builder to install Spatial > PostGIS 3.0
- Install Azure Data Studio
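As an optional sanity check on the local install (assuming psql can connect to the local server), you can confirm that Stack Builder made the spatial extensions available:

```sql
-- List the geocoding-related extensions available on the local PostgreSQL 13 instance.
SELECT name, default_version, installed_version
FROM pg_available_extensions
WHERE name LIKE 'postgis%' OR name LIKE 'address_standardizer%' OR name = 'fuzzystrmatch'
ORDER BY name;
```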
### Create staging folders

In cmd.exe:

```bat
mkdir c:\gisdata
mkdir c:\gisdata\temp
```
- Create an Azure PostgreSQL server (Single Server).
- Create a database on your Azure server called `gis`. Be sure your server allows connections from your local client IP address.
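If you prefer to create the database from a client rather than from the portal, a minimal sketch (assuming you first connect to the server's default postgres database as the admin user):

```sql
-- Run while connected to the Azure server's default "postgres" database.
CREATE DATABASE gis;
```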
### Activate extensions

```sql
CREATE EXTENSION postgis;
CREATE EXTENSION fuzzystrmatch;
CREATE EXTENSION postgis_tiger_geocoder;
CREATE EXTENSION address_standardizer;
```
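A quick, optional check that the geocoder pieces are wired up:

```sql
-- Show the installed PostGIS component versions.
SELECT postgis_full_version();

-- Exercise the address normalizer that the geocoder relies on.
SELECT * FROM normalize_address('1301 4th Ave, Seattle, WA 98101');
```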
### Prepare the loader tables

Edit the loader tables as described below (see also the articles under Resources and credits).

- Table `tiger.loader_platform`:

Note: the commands below reference the `7z` and `wget` utilities. You can download these from 7-Zip and GNU Wget. I put these utilities in my `C:\bin` folder, but you can put them anywhere you like; just be sure to update the paths in the commands below.
declare_sect (row 2, os = 'windows'):

```bat
set TMPDIR=C:\gisdata\temp
set UNZIPTOOL="C:\bin\7z.exe"
set WGETTOOL="C:\bin\wget.exe"
set PGBIN=C:\Program Files\PostgreSQL\13\bin\
set PGPORT=5432
set PGHOST=mypgsqlserver.postgres.database.azure.com
set PGUSER=user@mypgsqlserver
set PGPASSWORD=(pw)
set PGDATABASE=gis
set PSQL="C:\Program Files\PostgreSQL\13\bin\psql.exe"
set SHP2PGSQL="C:\Program Files\PostgreSQL\13\bin\shp2pgsql.exe"
cd C:\gisdata
```
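One way to apply the block above is to write it into the windows row of `tiger.loader_platform` directly; a sketch using a dollar-quoted string so the backslashes and quotes survive as-is:

```sql
-- Sketch: store the declare_sect shown above for the windows platform row.
UPDATE tiger.loader_platform
SET declare_sect = $decl$set TMPDIR=C:\gisdata\temp
set UNZIPTOOL="C:\bin\7z.exe"
set WGETTOOL="C:\bin\wget.exe"
set PGBIN=C:\Program Files\PostgreSQL\13\bin\
set PGPORT=5432
set PGHOST=mypgsqlserver.postgres.database.azure.com
set PGUSER=user@mypgsqlserver
set PGPASSWORD=(pw)
set PGDATABASE=gis
set PSQL="C:\Program Files\PostgreSQL\13\bin\psql.exe"
set SHP2PGSQL="C:\Program Files\PostgreSQL\13\bin\shp2pgsql.exe"
cd C:\gisdata$decl$
WHERE os = 'windows';
```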
- Table `tiger.loader_variables`:

| tiger_year | website_root | staging_fold |
| --- | --- | --- |
| 2019 | https://www2.census.gov/geo/tiger/TIGER2019 | C:\gisdata |
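Again, a sketch of how to set these values via SQL:

```sql
-- Sketch: point the loader at TIGER 2019 and the local staging folder.
UPDATE tiger.loader_variables
SET tiger_year = '2019',
    website_root = 'https://www2.census.gov/geo/tiger/TIGER2019',
    staging_fold = 'C:\gisdata';
```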
### Generate nation loader script

Note: the generated script may contain your cleartext password.

```bat
psql -h mypgsqlserver.postgres.database.azure.com -U user@mypgsqlserver -c "SELECT Loader_Generate_Nation_Script('windows')" -d gis -tA > C:\gisdata\nation_script_load.bat
```

Then run C:\gisdata\nation_script_load.bat.
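To confirm the nation load succeeded, a quick check (table names assume the loader's defaults):

```sql
-- Both should return non-zero row counts after the nation script finishes.
SELECT count(*) FROM tiger_data.state_all;
SELECT count(*) FROM tiger_data.county_all;
```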
### Generate state loader script

Note: the generated script may contain your cleartext password.

```bat
psql -h mypgsqlserver.postgres.database.azure.com -U user@mypgsqlserver -c "SELECT Loader_Generate_Script(ARRAY['CA','WA','OR','AK','MT','NM','TX'], 'windows')" -d gis -tA > C:\gisdata\state_script_load.bat
```

Or, to load one state at a time:

```bat
psql -h mypgsqlserver.postgres.database.azure.com -U user@mypgsqlserver -c "SELECT Loader_Generate_Script(ARRAY['WA'], 'windows')" -d gis -tA > C:\gisdata\state_script_load.bat
```

Then run C:\gisdata\state_script_load.bat.
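To spot-check a loaded state (the table prefix is the lowercase state abbreviation; Washington shown here as an example):

```sql
-- Each should return a non-zero row count after the WA portion of the script finishes.
SELECT count(*) FROM tiger_data.wa_edges;
SELECT count(*) FROM tiger_data.wa_addr;
SELECT count(*) FROM tiger_data.wa_featnames;
```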
### Cleanup

Run these commands to create indexes and increase performance.

```sql
SELECT install_missing_indexes();
vacuum analyze tiger.addr;
vacuum analyze tiger.edges;
vacuum analyze tiger.faces;
vacuum analyze tiger.featnames;
vacuum analyze tiger.place;
vacuum analyze tiger.cousub;
vacuum analyze tiger.county;
vacuum analyze tiger.state;
vacuum analyze tiger.zip_lookup_base;
vacuum analyze tiger.zip_state;
vacuum analyze tiger.zip_state_loc;
```
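Optionally, you can check whether any geocoder indexes are still missing after this step:

```sql
-- Returns the DDL for any still-missing indexes;
-- an empty string means the expected indexes are in place.
SELECT missing_indexes_generate_script();
```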
## Test geocoding

```sql
SELECT g.rating, ST_AsText(ST_SnapToGrid(g.geomout,0.00001)) As wktlonlat,
    (addy).address As stno, (addy).streetname As street,
    (addy).streettypeabbrev As styp, (addy).location As city, (addy).stateabbrev As st, (addy).zip
FROM geocode('1301 4th Ave, Seattle WA 98101',1) As g;
```
Expected results:

| rating | wktlonlat | stno | street | styp | city | st | zip |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | POINT(-122.33498 47.60818) | 1301 | 4th | Ave | Seattle | WA | 98101 |
Note that `rating` is 0, which means PostGIS has found an exact match for the address. Non-zero ratings (up to 100) indicate decreasing levels of confidence.
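If you only want to keep confident matches, you can filter on the rating; a sketch (the cutoff of 20 is an arbitrary example):

```sql
-- Keep only reasonably confident matches; the threshold of 20 is arbitrary.
SELECT pprint_addy(g.addy) AS matched_address, g.rating
FROM geocode('1301 4th Ave, Seattle WA 98101', 1) AS g
WHERE g.rating <= 20;
```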
## Batch geocoder

```sql
--- remove any existing attempts
DROP TABLE IF EXISTS address_table;

--- Create an empty table
--- Make the columns match those of the CSV that has the data
--- you want to geocode
CREATE TABLE address_table(
    per_id_fk varchar(255),
    input_address varchar(255));

--- import data from a CSV - the columns in the CSV should be in the list above
--- (run from psql: \copy reads the file on the client machine, since server-side
--- COPY FROM a file path is not available on Azure)
\copy address_table from 'C:\gisdata\TableOfAddresses.csv' with delimiter ',' null 'NA' csv header

--- Look at the data you added:
--- SELECT * FROM address_table;

--- Add the columns we'll need for geocoding
ALTER TABLE address_table
    ADD lon numeric,
    ADD lat numeric,
    ADD geomout geometry, -- a point geometry in NAD 83 long lat.
    ADD new_address varchar(255),
    ADD rating integer;

--- This function loops through the table, geocoding one row at a time:
CREATE OR REPLACE FUNCTION mygeocoder()
RETURNS void
AS $$
BEGIN
    FOR i IN 1..(SELECT count(per_id_fk) FROM address_table) LOOP
        UPDATE address_table
        SET (rating, new_address, lon, lat, geomout) =
            (COALESCE(g.rating,-1), pprint_addy(g.addy),
             ST_X(g.geomout)::numeric(8,5), ST_Y(g.geomout)::numeric(8,5), g.geomout)
        FROM (SELECT per_id_fk, input_address FROM address_table WHERE rating IS NULL ORDER BY per_id_fk LIMIT 1) As a
        LEFT JOIN LATERAL geocode(a.input_address,1) As g ON true
        WHERE a.per_id_fk = address_table.per_id_fk;
    END LOOP;
    RETURN;
END;
$$ LANGUAGE plpgsql;

--- Run your geocoder to geocode your address table:
SELECT mygeocoder();

--- Look at your table with geocoded addresses:
SELECT * FROM address_table order by rating desc limit 1000;
```
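To pull the geocoded results back to the local machine, a sketch using psql's client-side copy (the output path is just an example):

```sql
-- Run from psql; \copy writes the file on the client machine, which matters on Azure
-- because server-side COPY TO a file path is not available there.
\copy (SELECT per_id_fk, input_address, new_address, lon, lat, rating FROM address_table ORDER BY rating DESC) to 'C:\gisdata\geocoded_addresses.csv' csv header
```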
## Testing

Freely available address data (GeoJSON).

Sample 1000 records from the file (Linux): `shuf -n 1000 us_wa_king-addresses-county.geojson > 1000.json`

Python to parse the NDJSON file into addresses:
```python
import ujson as json
import pandas as pd

# the file is NDJSON: one GeoJSON feature per line
records = map(json.loads, open('us_wa_king-addresses-county.geojson'))
df = pd.DataFrame.from_records(records)
dfp = df['properties']
i = 0
while i < len(dfp):
    prop = dfp.iloc[i]
    # skip records with blank city
    if prop['city'] != '':
        addy = prop['number'] + ' ' + prop['street'] + ' ' + prop['city'] + ' WA ' + prop['postcode']
        print(addy)
    i = i + 1
```
## Benchmark geocode stats (Azure Single Server)

| Tier | # of addrs | Time (sec) | msec per record | Non-zero rating ratio |
| --- | --- | --- | --- | --- |
| 16 vCores, 555 GB, 1665 IOPS | 1000 | 35 | 35 | 7.2% |
| 16 vCores, 555 GB, 1665 IOPS | 10000 | 535 | 54 | 6.9% |
| 4 vCores, 555 GB, 1665 IOPS | 1000 | 65 | 65 | 7.2% |
## SQL Snippets

Because I forget things.

```sql
-- list available extensions and versions
SELECT * FROM pg_available_extensions ORDER BY name;

-- all tables and row counts
SELECT schemaname, relname, n_live_tup
FROM pg_stat_user_tables
ORDER BY n_live_tup DESC;

-- long running queries
SELECT
    pid,
    now() - pg_stat_activity.query_start AS duration,
    query,
    state
FROM pg_stat_activity
WHERE now() - query_start > interval '5 second'
    AND state != 'idle';

-- all running queries
SELECT * FROM pg_stat_activity;
```
## Resources and credits

- https://experimentalcraft.wordpress.com/2017/11/01/how-to-make-a-postgis-tiger-geocoder-in-less-than-5-days/
- https://experimentalcraft.wordpress.com/2017/12/21/batch-geocoding-with-postgis/
- https://github.com/gravitystorm/postgis/blob/master/extras/tiger_geocoder/tiger_2010/geocode/geocode.sql
- https://github.com/michaelvacosta/AzurePostgreSQLGeospatialSample#azurepostgresqlgeospatialsample
- https://techcommunity.microsoft.com/t5/azure-database-for-postgresql/importing-spatial-data-to-postgis/ba-p/1255421