-
Notifications
You must be signed in to change notification settings - Fork 1
Populando dados CSV no repositório GIT
Peter edited this page May 17, 2019
·
4 revisions
Lembretes:
- atualizar https://github.com/datasets-br/city-codes com novos dados do IBGE e da Wikidata
- reusar convenções do Stable, por exemplo nomes de cidade por CamelCase.
- IBGE, carregar integral, com algoritmo de scan files, etc. Agro2017 e endereços censo2010, ou direto do portal endereços.
Transformando dados do osm_planet_br
em relatórios de CEP.
CREATE INDEX idx_planet_osm_point_tags_cep ON planet_osm_point ( (tags ->>'addr:postcode') );
-- acrescer idx_ref para citar OSM, IBGE, etc. junto ao gtype.
CREATE TABLE kx_br_ceps_from_osm AS
select 'n' gtype, c.uf, replace(c.lexlabel,'.','_') as municip,
pt.tags->>'addr:postcode' as cep,
count(*) n
from public.vw_osm_city_polygon c INNER JOIN planet_osm_point pt
ON st_contains(c.way,pt.way)
where pt.tags?'addr:postcode'
GROUP BY 1,2,3,4
;
INSERT INTO kx_br_ceps_from_osm(gtype,uf,municip,cep,n)
select 'w' gtype, c.uf, replace(c.lexlabel,'.','_') as municip,
pt.tags->>'addr:postcode' as cep,
count(*) n
from public.vw_osm_city_polygon c INNER JOIN planet_osm_line pt
ON st_contains(c.way,pt.way)
where pt.tags?'addr:postcode'
GROUP BY 1,2,3,4
;
INSERT INTO kx_br_ceps_from_osm(gtype,uf,municip,cep,n)
select 'r' gtype, c.uf, replace(c .lexlabel,'.','_') as municip,
pt.tags->>'addr:postcode' as cep,
count(*) n
from public.vw_osm_city_polygon c INNER JOIN planet_osm_polygon pt
ON c.way && pt.way AND st_contains(c.way,pt.way)
where pt.tags?'addr:postcode'
GROUP BY 1,2,3,4
;
--- for all, IMPORTANT:
UPDATE kx_br_ceps_from_osm set cep= replace(replace(cep,'-',''),'.',''); -- normalize
DELETE FROM kx_br_ceps_from_osm WHERE cep!~ '^\d{8,8}$'; -- ~100
-- report:
SELECT count(distinct cep),count(*) from kx_br_ceps_from_osm; -- 20355 | 24151
SELECT count(distinct cep) from kx_br_ceps_from_osm where n>1; -- 6648
SELECT count(distinct uf||municip) from kx_br_ceps_from_osm where n>1; -- 1116
CREATE VIEW vw_kx_br_ceps_from_osm AS
SELECT uf, municip, cep,
round(SUM(sqrt(n)+ CASE WHEN n<4 THEN n-1 ELSE 2 END)) idx_confirm,
max(idx_ref)||'-'||array_to_string(array_agg(distinct gtype),'') idx_ref
FROM kx_br_ceps_from_osm
-- WHERE uf=%L AND municip=%L
GROUP BY 1,2,3
ORDER BY cep
;
SELECT cod_municipio, count(distinct cep) ceps_distintos, count(*) ceps
from ibge_agro2017 where cep is not null
group by 1
; -- em SP são 640 municípios
SELECT cod_municipio, lexlabel, count(distinct cep) ceps
-- count(*) n, count(*) FILTER (WHERE is_cep_in_range) n_inrange
from vw_ibge_agro2017_full where cep is not null
group by 1,2 having count(distinct cep)>1 order by 3 desc
; -- 446
Em SP 446 municípios com mais de um CEP distinto na região rural, deles 6 com mais de 100, a maioria (357) com menos de 10.
cod_municipio_sp | lexlabel | ceps distintos |
---|---|---|
50308 | sao_paulo | 204 |
9502 | campinas | 159 |
25904 | jundiai | 127 |
30607 | mogi_cruzes | 126 |
49805 | sao_jose_rio_preto | 120 |
52502 | suzano | 102 |
29005 | marilia | 76 |
38709 | piracicaba | 75 |
... | ... | ... |
Linhas de shell para preparar o diretório /tmp/stable
com o material:
rm -r /tmp/stable
# gera mkdir das pastas:
psql -c "select distinct 'mkdir /tmp/stable/'||uf from kx_br_ceps_from_osm"
#... rodar script gerado por copy/paste no terminal
Comando SQL para a formação dos arquivos .csv
de CEPs de cada município, via SQL COPY
:
-- select distinct 'mkdir /tmp/stable/'||uf from kx_br_ceps_from_osm;
-- chown -R postgres:postgres /tmp/stable/
SELECT copy_csv(
uf||'/'||municip||'.csv'
,format('
SELECT cep, idx_confirm, idx_ref
FROM vw_kx_br_ceps_from_osm
WHERE uf=%L AND municip=%L
', uf, municip
)
,true --header
,'/tmp/stable/' -- base path
) FROM (
select distinct uf,municip from kx_br_ceps_from_osm order by 1
) t; -- 1787 arquivos. Quando houverem mais refs reuinir com ';'.
PS: para a situação do CEP no OSM em 2018 ver https://forum.openstreetmap.org/viewtopic.php?id=61604
A função de cópia faz parte da lib do projeto,
/**
* COPY TO CSV HEADER.
*/
CREATE or replace FUNCTION copy_csv(
p_filename text,
p_query text,
p_useheader boolean = true,
p_root text = '/tmp/'
) RETURNS text AS $f$
BEGIN
EXECUTE format(
'COPY (%s) TO %L CSV %s'
,p_query
,p_root||p_filename
,CASE WHEN p_useheader THEN 'HEADER' ELSE '' END
);
RETURN p_filename;
END;
$f$ LANGUAGE plpgsql STRICT;