forked from vdresch/mapa_da_violencia
-
Notifications
You must be signed in to change notification settings - Fork 0
/
processamento.py
92 lines (70 loc) · 3.74 KB
/
processamento.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
######################################################################################################
#
# The objective of this code is to clean the data provided by Secretaria da Segurança
# Pública. The neighborhoods will be cleaned and the output is a CSV that can be used by Tableau.
# There will also be a CSV with metadata about the neighborhoods.
#
# The code takes CSV files located in the folder data, as well as a shapefile located in the folder
# shapes_bairros2016. In this folder, there is also a Wikipedia table with the metadata from the
# year 2010, to be updated with the new Census.
#
# The output will be two CSV files, both on the folder data.
#
######################################################################################################
import pandas as pd
import difflib
import fiona
# Read the yearly crime reports (semicolon-separated, ISO-8859-1 encoded) and
# stack them into one table with a fresh running index.
report_paths = (
    'data/crimes_2021.csv',
    'data/crimes_2022.csv',
    'data/crimes_2023.csv',
)
yearly_reports = [
    pd.read_csv(path, sep=';', encoding="ISO-8859-1") for path in report_paths
]
crimes = pd.concat(yearly_reports, ignore_index=True)
# Keep only the first ten columns; everything after them is not needed.
crimes = crimes.drop(columns=crimes.columns[10:])
# Keep only occurrences registered in Porto Alegre that have a neighborhood.
in_porto_alegre = crimes['Municipio Fato'] == 'PORTO ALEGRE'
has_neighborhood = crimes['Bairro'].notna()
crimes = crimes[in_porto_alegre & has_neighborhood]
# Normalize the neighborhood names so they can be matched against the
# shapefile: lower-case them and strip accents (NFKD-decompose, then drop
# every character that is not plain ASCII).
# astype(str) guards against non-text cells, for which the .str methods
# would otherwise silently produce NaN.
bairro_clean = (
    crimes['Bairro']
    .astype(str)
    .str.lower()
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)
# Expand common abbreviations. The word boundaries matter: a plain substring
# replace also rewrites names that merely contain the letters, e.g.
# 'floresta' -> 'floresanta' or 'bela vista' -> 'bela visanta'.
bairro_clean = bairro_clean.str.replace(r'\bvl\b', 'vila', regex=True)
bairro_clean = bairro_clean.str.replace(r'\bsta\b', 'santa', regex=True)
# Re-label places that the map used for the analysis files under another
# neighborhood. These stay literal substring replacements; regex=False is
# spelled out so the result does not depend on the pandas version default.
bairro_clean = bairro_clean.str.replace('protasio alves', 'morro santana', regex=False)
bairro_clean = bairro_clean.str.replace('cais do porto', 'centro historico', regex=False)
bairro_clean = bairro_clean.str.replace('intercap', 'partenon', regex=False)
# Only the exact value 'centro' is renamed; longer names containing it are kept.
bairro_clean = bairro_clean.mask(bairro_clean == 'centro', 'centro historico')
# Upper case for the comparison with the shapefile names.
crimes['Bairro'] = bairro_clean.str.upper()
# Collect the neighborhood names (the NOME property of every feature) from
# the shapefile. The with-block closes the collection once the names are read.
# NOTE(review): the header comment calls this folder `shapes_bairros2016`;
# confirm which spelling exists on disk.
with fiona.open("shapesbairros2016/Bairros_2016.shp") as porto_alegre:
    bairros = [feature['properties']['NOME'] for feature in porto_alegre]
# Find the closest official neighborhood name for every cleaned name.
# The match depends only on the name, so it is computed once per distinct
# name and then mapped back onto the rows instead of re-running difflib for
# every single row. Each entry is a list holding the best match, or an empty
# list when no official name is similar enough.
closest_by_name = {
    name: difflib.get_close_matches(name, bairros, n=1)
    for name in crimes['Bairro'].unique()
}
crimes['Bairro2'] = crimes['Bairro'].map(closest_by_name.get)
# Save the names that could not be matched, with per-column row counts, so
# they can be reviewed by hand.
unmatched = crimes['Bairro2'].str.len() == 0
crimes[unmatched].groupby('Bairro').count().to_csv('data/error.csv')
# Keep only the matched rows, overwrite the neighborhood with the official
# name and drop the helper column.
crimes = (
    crimes[~unmatched]
    .assign(Bairro=lambda matched_rows: matched_rows['Bairro2'].str[0])
    .drop(columns='Bairro2')
)
# Save the processed data.
crimes.to_csv('data/processed_data.csv')
# Second output: the neighborhood metadata table, with its names aligned to
# the shapefile names in the same way as the crime data.
bairros_metadata = pd.read_csv('shapesbairros2016/Lista_de_bairros_de_Porto_Alegre_1.csv')


def _closest_official_name(raw_name):
    """Return a list with the closest shapefile name for the upper-cased
    raw_name, or an empty list when nothing is similar enough."""
    return difflib.get_close_matches(raw_name.upper(), bairros, n=1)


# One candidate list per row of the table.
candidates = bairros_metadata['Bairro'].map(_closest_official_name)
# Rows without any candidate are dropped; the rest take the matched name.
has_match = candidates.str.len() != 0
bairros_metadata = bairros_metadata[has_match].assign(
    Bairro=candidates[has_match].str[0]
)
# Write the aligned metadata.
bairros_metadata.to_csv('data/bairros_metadata.csv')