CNEFE

De Documentação
Revisão de 22h56min de 31 de maio de 2024 por Peter (discussão | contribs) (movendo pontos para as faces)

CNEFE - Cadastro Nacional de Endereços para Fins Estatísticos

Ver https://www.ibge.gov.br/estatisticas/sociais/populacao/38734-cadastro-nacional-de-enderecos-para-fins-estatisticos.html?edicao=40122&t=resultados

Faces de Quadra

Pegar em 4314902_faces_de_logradouros_2021.prj

mkdir /tmp/POA_FACES/

cd /tmp/POA_FACES

wget https://dl.digital-guard.org/5d5daa06e048ad8c83c909babee040e26fd8a54a29ed4cef2d2d220a3bd17b84.zip

zip -T 5d5daa06e048ad8c83c909babee040e26fd8a54a29ed4cef2d2d220a3bd17b84.zip

cd '/tmp/5d5daa06e048ad8c83c909babee040e26fd8a54a29ed4cef2d2d220a3bd17b84/rs_faces_de_logradouros_2021/'
cp 4314902* /tmp/POA_FACES

Supor arquivos logradouro em log:

create database poa;
\c poa
create extension postgis;
-----
shp2pgsql  log | psql -q postgres://postgres@localhost/poa

Melhorando os dados:

drop view if exists vw01_log ;
create view vw01_log as select *, regexp_replace( coalesce(nm_tip_log,'') || ' '||  coalesce(nm_tit_log,'') ||' '|| coalesce(nm_log,''), '\s+', ' ', 'g') as via_name from log;

select gid, cd_setor, cd_quadra, cd_face, via_name, tot_res from vw01_log limit 100;

Pontos

Por cidade temos Censo_Demografico_2022/Arquivos_CNEFE/Municipio, direto latLong em srid=4326.

Pelo cabeçalho podemos copiar para uma tabela texto,

CREATE TABLE ibge_cnefe2022 (
COD_UNICO_ENDERECO text,
COD_UF text,
COD_MUNICIPIO text,
COD_DISTRITO text,
COD_SUBDISTRITO text,
COD_SETOR text,
NUM_QUADRA text,
NUM_FACE text,
CEP text,
DSC_LOCALIDADE text,
NOM_TIPO_SEGLOGR text,
NOM_TITULO_SEGLOGR text,
NOM_SEGLOGR text,
NUM_ENDERECO text,
DSC_MODIFICADOR text,
NOM_COMP_ELEM1 text,
VAL_COMP_ELEM1 text,
NOM_COMP_ELEM2 text,
VAL_COMP_ELEM2 text,
NOM_COMP_ELEM3 text,
VAL_COMP_ELEM3 text,
NOM_COMP_ELEM4 text,
VAL_COMP_ELEM4 text,
NOM_COMP_ELEM5 text,
VAL_COMP_ELEM5 text,
LATITUDE real,
LONGITUDE real,
NV_GEO_COORD text,
COD_ESPECIE text,
DSC_ESTABELECIMENTO text,
COD_INDICADOR_ESTAB_ENDERECO text,
COD_INDICADOR_CONST_ENDERECO text,
COD_INDICADOR_FINALIDADE_CONST text,
COD_TIPO_ESPECI text
);
copy ibge_cnefe2022 from '/tmp/POA_FACES/4314902_PORTO_ALEGRE.csv' with  (format CSV, delimiter ';', header);

----
drop view if exists vw02_ibge_cnefe2022;
create view vw02_ibge_cnefe2022 as
 select *, ST_GeoHash(geom,9) as ghs
 from (
  select *, regexp_replace( coalesce(nom_tipo_seglogr,'') ||' '|| coalesce(nom_titulo_seglogr,'') ||' '|| coalesce(nom_seglogr,''), '\s+',' ','g') as via_name,
         ST_SetSRID(ST_MakePoint(longitude::float, latitude::float),4326) as geom
  from ibge_cnefe2022
 ) t
;
--select count(*) from ibge_cnefe2022; -- 762239
-- select count(*) from (select distinct ST_GeoHash(geom,9) as ghs, cod_setor, num_quadra, num_face, via_name, num_endereco from vw02_ibge_cnefe2022 -- same where geom is not null and via_name>'' and num_endereco>'' ) t; -- 379957
-- select 379957.0/762239; -- 49,847% = 50%
  • Unicidade de endereço: DISTINCT cod_setor, cep, via_name, num_endereco
  • Unicidade de posição: DISTINCT ghs
  • Filtrando unicidades: primeiro de endereço, porém incluindo array de geohashes distintos. Se forem vizinhos não possuem mesmo prefixo conferir maior distancia entre eles.
CREATE FUNCTION hcodes_sameparent(g text[]) RETURNS boolean AS $f$
  SELECT  bool_and( CASE
     WHEN l1=l2 THEN substr(g1,1,l1-1)=substr(g2,1,l2-1)
     WHEN l1>l2 THEN g1 ~ ('^'||g2)  -- WHEN l1=l2+1
     ELSE g2 ~ ('^'||g1)  -- WHEN l2=l1+1  ELSE false
     END  )
  FROM ( 
    select g1, g2, length(g1) l1, length(g2) l2
    from unnest(g) WITH ORDINALITY AS a(g1,i1) 
    inner join unnest(g) WITH ORDINALITY AS b(g2,i2) 
      on i1<i2
    ) t1
$f$ LANGUAGE SQL;

CREATE FUNCTION geoms_maxdist(g geometry[], use_spheroid boolean default false) RETURNS float AS $f$
    SELECT MAX( ST_Distance(g1,g2,use_spheroid) )
    FROM unnest(g) WITH ORDINALITY AS a(g1,i1) 
    INNER JOIN unnest(g) WITH ORDINALITY AS b(g2,i2) 
      ON i1<i2
$f$ LANGUAGE SQL;

create table tmp_ibge_cnefe2022_uniq1 AS  -- unicidade básica, ainda com repetição de pontos
   SELECT DISTINCT ghs, via_name, num_endereco
   FROM vw02_ibge_cnefe2022
; -- 379228 de 762239 = select round(100.0*379228.0/762239); = 50%

select avg(n) from (select ghs, count(*) n from ibge_cnefe2022_uniq1 group by 1) t
where n=1 -- n>1
;
-- 325993 não-duplicados mais 2.16*24642 duplicados = 325993+2.16*24642 = 379220 ~ 379228.

Só na redução de pontos para 50% já é bem drástica, provavelmente inclui casos que diferenciam apenas no complemento. Desses, com duplicidade posicional apenas ~7,6% (24642 de 325993), sendo também razoável o seu descarte de tmp_ibge_cnefe2022_uniq1.

drop table if exists ibge_cnefe2022_uniq2;
create table ibge_cnefe2022_uniq2 AS  -- remove endereços duplicados em locais diferentes 
 select v.cod_setor, v.num_quadra, v.num_face, v.cep, v.via_name, v.num_endereco,
       array_agg(v.ghs) ghss, array_agg(v.geom) geoms
 from(  
   SELECT DISTINCT ON (ghs) ghs, via_name, num_endereco, count(*) OVER (PARTITION BY ghs) AS ghs_count
   FROM tmp_ibge_cnefe2022_uniq1 
 ) g -- 325993
 INNER JOIN vw02_ibge_cnefe2022 v ON 1=g.ghs_count AND g.ghs=v.ghs
group by 1,2,3,4,5,6
order by 1,2,3,4,5,6
; -- 257387


select count(*) from (select cep, via_name, num_endereco, hcodes_sameparent(ghss) as is_same from ibge_cnefe2022_uniq2 where cardinality(ghss)>1 ) where is_same;-- 49597
select count(*) from (select cep, via_name, num_endereco, hcodes_sameparent(ghss) as is_same from ibge_cnefe2022_uniq2 where cardinality(ghss)>1 ) where not(is_same); -- 22523

-- conclusão: vale a pena manter o same, removendo o último dígito, fornecendo um booleano para usuário decidir se valido ou não.
-- os casos de não same podem ser resgatados por geometria

select count(*), avg(dist) from (select geoms_maxdist(geoms,true) dist from (select cep, via_name, num_endereco, hcodes_sameparent(ghss) as is_same, geoms from ibge_cnefe2022_uniq2 where cardinality(ghss)>1 ) where not(is_same)) where dist<=20; 
-- 12193
select count(*), avg(dist) from (select geoms_maxdist(geoms,true) dist from (select cep, via_name, num_endereco, hcodes_sameparent(ghss) as is_same, geoms from ibge_cnefe2022_uniq2 where cardinality(ghss)>1 ) where not(is_same)) where dist<=5; 
-- 1848  (não compensa) 

select count(*), avg(dist) from (select geoms_maxdist(geoms,true) dist from (select cep, via_name, num_endereco, hcodes_sameparent(ghss) as is_same, geoms from ibge_cnefe2022_uniq2 where cardinality(ghss)>1 ) t2 where is_same) t3 
--  49597 | 2.67   (ok distancia pequena e numero expressivo)

select count(*), count(distinct via_name||num_endereco) from (select distinct  cod_setor, via_name, num_endereco, cep
from vw02_ibge_cnefe2022          
) t; -- 288685 | 277160

Tratamentos finais

drop table ibge_cnefe2022_uniq3;
CREATE TABLE  ibge_cnefe2022_uniq3 AS
 SELECT ghss[1] as ghs, cod_setor, num_quadra::int as num_quadra, num_face::int num_face,
        cep, via_name, num_endereco,
        false AS is_centroid
 FROM ibge_cnefe2022_uniq2 WHERE cardinality(ghss)=1

 UNION ALL

 SELECT st_geohash(geoms_centroid(geoms),9) as ghs,
        cod_setor, num_quadra::int, num_face::int, cep, via_name, num_endereco,
        true AS is_centroid
 FROM ibge_cnefe2022_uniq2 
 WHERE cardinality(ghss)>1
       AND (hcodes_sameparent(ghss) OR (geoms_maxdist(geoms,true))[2]<=20)
 ORDER BY 1
;  -- 249834
----
copy ibge_cnefe2022_uniq3 to '/tmp/BR-RS-POA2024_CNEFE-ptsEnderecoUnico.csv' CSV HEADER;
copy (select gid, cd_setor, cd_quadra::int as cd_quadra, cd_face::int as cd_face, via_name, tot_res, geom FROM vw01_log) TO '/tmp/BR-RS-POA-faces.csv' CSV HEADER;
-- COPY 56645
--- CONTAGENS DE JOIN
 With f AS(
 SELECT gid, cd_setor, cd_quadra::int as cd_quadra, cd_face::int as cd_face,
        via_name, tot_res, st_geohash(geom,7) as ghs7
 FROM vw01_log
) 
 SELECT count(*) 
 FROM ibge_cnefe2022_uniq3 u INNER JOIN f
   ON substring(u.cod_setor,1,15)!=f.cd_setor AND substring(u.ghs,1,7)=f.ghs7 AND f.cd_quadra=u.num_quadra AND f.cd_face=u.num_face AND f.via_name=u.via_name 
 ; -- 5893
--  testar se são consistentes ON substring(u.ghs,1,7)=f.ghs7 AND f.cd_quadra=u.num_quadra AND f.cd_face=u.num_face AND f.via_name=u.via_name 

------- final só confiaveis:
drop table ibge_cnefe2022_uniq_onface;
create table ibge_cnefe2022_uniq_onface AS
 WITH f AS (
  SELECT gid, cd_setor, cd_quadra::int as cd_quadra, cd_face::int as cd_face,
        via_name, tot_res, st_geohash(geom,7) as ghs7, geom
  FROM vw01_log
 )
 SELECT u.ghs, u.cep, u.via_name, u.num_endereco, u.is_centroid, substring(u.ghs,1,7)=f.ghs7 as is_nearface,
        ST_ClosestPoint(f.geom, ST_PointFromGeoHash(u.ghs)) as geom 
 FROM ibge_cnefe2022_uniq3 u INNER JOIN f
   ON substring(u.cod_setor,1,15)=f.cd_setor AND f.cd_quadra=u.num_quadra AND f.cd_face=u.num_face AND f.via_name=u.via_name 
   order by 1
 ; -- 198797