CNEFE
CNEFE - Cadastro Nacional de Endereços para Fins Estatísticos
Faces de Quadra
Pegar em 4314902_faces_de_logradouros_2021.prj
mkdir /tmp/POA_FACES/ cd /tmp/POA_FACES wget https://dl.digital-guard.org/5d5daa06e048ad8c83c909babee040e26fd8a54a29ed4cef2d2d220a3bd17b84.zip zip -T 5d5daa06e048ad8c83c909babee040e26fd8a54a29ed4cef2d2d220a3bd17b84.zip cd '/tmp/5d5daa06e048ad8c83c909babee040e26fd8a54a29ed4cef2d2d220a3bd17b84/rs_faces_de_logradouros_2021/' cp 4314902* /tmp/POA_FACES
Supor arquivos logradouro em log:
create database poa; \c poa create extension postgis; ----- shp2pgsql log | psql -q postgres://postgres@localhost/poa
Melhorando os dados:
drop view if exists vw01_log ; create view vw01_log as select *, regexp_replace( coalesce(nm_tip_log,'') || ' '|| coalesce(nm_tit_log,'') ||' '|| coalesce(nm_log,''), '\s+', ' ', 'g') as via_name from log; select gid, cd_setor, cd_quadra, cd_face, via_name, tot_res from vw01_log limit 100;
Pontos
Por cidade temos Censo_Demografico_2022/Arquivos_CNEFE/Municipio, direto latLong em srid=4326.
Pelo cabeçalho podemos copiar para uma tabela texto,
CREATE TABLE ibge_cnefe2022 ( COD_UNICO_ENDERECO text, COD_UF text, COD_MUNICIPIO text, COD_DISTRITO text, COD_SUBDISTRITO text, COD_SETOR text, NUM_QUADRA text, NUM_FACE text, CEP text, DSC_LOCALIDADE text, NOM_TIPO_SEGLOGR text, NOM_TITULO_SEGLOGR text, NOM_SEGLOGR text, NUM_ENDERECO text, DSC_MODIFICADOR text, NOM_COMP_ELEM1 text, VAL_COMP_ELEM1 text, NOM_COMP_ELEM2 text, VAL_COMP_ELEM2 text, NOM_COMP_ELEM3 text, VAL_COMP_ELEM3 text, NOM_COMP_ELEM4 text, VAL_COMP_ELEM4 text, NOM_COMP_ELEM5 text, VAL_COMP_ELEM5 text, LATITUDE real, LONGITUDE real, NV_GEO_COORD text, COD_ESPECIE text, DSC_ESTABELECIMENTO text, COD_INDICADOR_ESTAB_ENDERECO text, COD_INDICADOR_CONST_ENDERECO text, COD_INDICADOR_FINALIDADE_CONST text, COD_TIPO_ESPECI text ); copy ibge_cnefe2022 from '/tmp/POA_FACES/4314902_PORTO_ALEGRE.csv' with (format CSV, delimiter ';', header); ---- drop view if exists vw02_ibge_cnefe2022; create view vw02_ibge_cnefe2022 as select *, ST_GeoHash(geom,9) as ghs from ( select *, regexp_replace( coalesce(nom_tipo_seglogr,'') ||' '|| coalesce(nom_titulo_seglogr,'') ||' '|| coalesce(nom_seglogr,''), '\s+',' ','g') as via_name, ST_SetSRID(ST_MakePoint(longitude::float, latitude::float),4326) as geom from ibge_cnefe2022 ) t ; --select count(*) from ibge_cnefe2022; -- 762239 -- select count(*) from (select distinct ST_GeoHash(geom,9) as ghs, cod_setor, num_quadra, num_face, via_name, num_endereco from vw02_ibge_cnefe2022 -- same where geom is not null and via_name>'' and num_endereco>'' ) t; -- 379957 -- select 379957.0/762239; -- 49,847% = 50%
- Unicidade de endereço: DISTINCT cod_setor, cep, via_name, num_endereco
- Unicidade de posição: DISTINCT ghs
- Filtrando unicidades: primeiro de endereço, porém incluindo array de geohashes distintos. Se forem vizinhos não possuem mesmo prefixo conferir maior distancia entre eles.
CREATE FUNCTION hcodes_sameparent(g text[]) RETURNS boolean AS $f$ SELECT bool_and( CASE WHEN l1=l2 THEN substr(g1,1,l1-1)=substr(g2,1,l2-1) WHEN l1>l2 THEN g1 ~ ('^'||g2) -- WHEN l1=l2+1 ELSE g2 ~ ('^'||g1) -- WHEN l2=l1+1 ELSE false END ) FROM ( select g1, g2, length(g1) l1, length(g2) l2 from unnest(g) WITH ORDINALITY AS a(g1,i1) inner join unnest(g) WITH ORDINALITY AS b(g2,i2) on i1<i2 ) t1 $f$ LANGUAGE SQL; CREATE FUNCTION geoms_maxdist(g geometry[], use_spheroid boolean default false) RETURNS float AS $f$ SELECT MAX( ST_Distance(g1,g2,use_spheroid) ) FROM unnest(g) WITH ORDINALITY AS a(g1,i1) INNER JOIN unnest(g) WITH ORDINALITY AS b(g2,i2) ON i1<i2 $f$ LANGUAGE SQL; create table tmp_ibge_cnefe2022_uniq1 AS -- unicidade básica, ainda com repetição de pontos SELECT DISTINCT ghs, via_name, num_endereco FROM vw02_ibge_cnefe2022 ; -- 379228 de 762239 = select round(100.0*379228.0/762239); = 50% select avg(n) from (select ghs, count(*) n from ibge_cnefe2022_uniq1 group by 1) t where n=1 -- n>1 ; -- 325993 não-duplicados mais 2.16*24642 duplicados = 325993+2.16*24642 = 379220 ~ 379228.
Só na redução de pontos para 50% já é bem drástica, provavelmente inclui casos que diferenciam apenas no complemento. Desses, com duplicidade posicional apenas ~7,6% (24642 de 325993), sendo também razoável o seu descarte de tmp_ibge_cnefe2022_uniq1.
drop table if exists ibge_cnefe2022_uniq2; create table ibge_cnefe2022_uniq2 AS -- remove endereços duplicados em locais diferentes select v.cod_setor, v.num_quadra, v.num_face, v.cep, v.via_name, v.num_endereco, array_agg(v.ghs) ghss, array_agg(v.geom) geoms from( SELECT DISTINCT ON (ghs) ghs, via_name, num_endereco, count(*) OVER (PARTITION BY ghs) AS ghs_count FROM tmp_ibge_cnefe2022_uniq1 ) g -- 325993 INNER JOIN vw02_ibge_cnefe2022 v ON 1=g.ghs_count AND g.ghs=v.ghs group by 1,2,3,4,5,6 order by 1,2,3,4,5,6 ; -- 257387 select count(*) from (select cep, via_name, num_endereco, hcodes_sameparent(ghss) as is_same from ibge_cnefe2022_uniq2 where cardinality(ghss)>1 ) where is_same;-- 49597 select count(*) from (select cep, via_name, num_endereco, hcodes_sameparent(ghss) as is_same from ibge_cnefe2022_uniq2 where cardinality(ghss)>1 ) where not(is_same); -- 22523 -- conclusão: vale a pena manter o same, removendo o último dígito, fornecendo um booleano para usuário decidir se valido ou não. -- os casos de não same podem ser resgatados por geometria select count(*), avg(dist) from (select geoms_maxdist(geoms,true) dist from (select cep, via_name, num_endereco, hcodes_sameparent(ghss) as is_same, geoms from ibge_cnefe2022_uniq2 where cardinality(ghss)>1 ) where not(is_same)) where dist<=20; -- 12193 select count(*), avg(dist) from (select geoms_maxdist(geoms,true) dist from (select cep, via_name, num_endereco, hcodes_sameparent(ghss) as is_same, geoms from ibge_cnefe2022_uniq2 where cardinality(ghss)>1 ) where not(is_same)) where dist<=5; -- 1848 (não compensa) select count(*), avg(dist) from (select geoms_maxdist(geoms,true) dist from (select cep, via_name, num_endereco, hcodes_sameparent(ghss) as is_same, geoms from ibge_cnefe2022_uniq2 where cardinality(ghss)>1 ) t2 where is_same) t3 -- 49597 | 2.67 (ok distancia pequena e numero expressivo) select count(*), count(distinct via_name||num_endereco) from (select distinct cod_setor, via_name, num_endereco, cep from vw02_ibge_cnefe2022 ) t; -- 288685 | 277160
Tratamentos finais
drop table ibge_cnefe2022_uniq3; CREATE TABLE ibge_cnefe2022_uniq3 AS SELECT ghss[1] as ghs, cod_setor, num_quadra::int as num_quadra, num_face::int num_face, cep, via_name, num_endereco, false AS is_centroid FROM ibge_cnefe2022_uniq2 WHERE cardinality(ghss)=1 UNION ALL SELECT st_geohash(geoms_centroid(geoms),9) as ghs, cod_setor, num_quadra::int, num_face::int, cep, via_name, num_endereco, true AS is_centroid FROM ibge_cnefe2022_uniq2 WHERE cardinality(ghss)>1 AND (hcodes_sameparent(ghss) OR (geoms_maxdist(geoms,true))[2]<=20) ORDER BY 1 ; -- 249834 ---- copy ibge_cnefe2022_uniq3 to '/tmp/BR-RS-POA2024_CNEFE-ptsEnderecoUnico.csv' CSV HEADER; copy (select gid, cd_setor, cd_quadra::int as cd_quadra, cd_face::int as cd_face, via_name, tot_res, geom FROM vw01_log) TO '/tmp/BR-RS-POA-faces.csv' CSV HEADER; -- COPY 56645 --- CONTAGENS DE JOIN With f AS( SELECT gid, cd_setor, cd_quadra::int as cd_quadra, cd_face::int as cd_face, via_name, tot_res, st_geohash(geom,7) as ghs7 FROM vw01_log ) SELECT count(*) FROM ibge_cnefe2022_uniq3 u INNER JOIN f ON substring(u.cod_setor,1,15)!=f.cd_setor AND substring(u.ghs,1,7)=f.ghs7 AND f.cd_quadra=u.num_quadra AND f.cd_face=u.num_face AND f.via_name=u.via_name ; -- 5893 -- testar se são consistentes ON substring(u.ghs,1,7)=f.ghs7 AND f.cd_quadra=u.num_quadra AND f.cd_face=u.num_face AND f.via_name=u.via_name ------- final só confiaveis: drop table ibge_cnefe2022_uniq_onface; create table ibge_cnefe2022_uniq_onface AS WITH f AS ( SELECT gid, cd_setor, cd_quadra::int as cd_quadra, cd_face::int as cd_face, via_name, tot_res, st_geohash(geom,7) as ghs7, geom FROM vw01_log ) SELECT u.ghs, u.cep, u.via_name, u.num_endereco, u.is_centroid, substring(u.ghs,1,7)=f.ghs7 as is_nearface, ST_ClosestPoint(f.geom, ST_PointFromGeoHash(u.ghs)) as geom FROM ibge_cnefe2022_uniq3 u INNER JOIN f ON substring(u.cod_setor,1,15)=f.cd_setor AND f.cd_quadra=u.num_quadra AND f.cd_face=u.num_face AND f.via_name=u.via_name order by 1 ; -- 198797