Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add files via upload #40

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 129 additions & 0 deletions scripts/bra_00_boundaries.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# set up ------------------------------------------------------------------
# Clear environment
rm(list = ls())

# Load necessary libraries
library(dplyr)
library(geobr)
library(sf)
library(lwgeom)
library(readxl)

ids <- read_excel("00_doc/Documentacao/Documentação/Divisão Territorial do Brasil/Unidades da Federação, Mesorregiões, microrregiões e municípios 2010_modif.xls")

setwd("01_data/01_raw/boundaries/")

# downloading -------------------------------------------------------------
nat <- read_country(year=2010) |>
st_write("nat.geojson")

reg <- read_region(year = 2010) |>
setNames(c("reg_code", "reg_name", "geom")) |>
st_write("reg.geojson", delete_dsn = TRUE)

uf <- read_state(year=2010) |>
setNames(c("uf_code", "uf_name", "reg_code", "reg_name", "geom")) |>
st_write("uf.geojson")

uf_codes <- unique(uf$abbrev_state) # get a vector of uf codes to download census tract files

uf <- read_state(year=2010) |>
select(-abbrev_state)

mesoreg <- read_meso_region(year=2010) |>
select(-abbrev_state) |>
setNames(c("uf_code", "uf_name", "mesoreg_id", "mesoreg_name", "geom")) |>
st_write("mesoreg.geojson")

microreg <- read_micro_region(year=2010) |>
select(-abbrev_state) |>
setNames(c("uf_code", "uf_name", "microreg_id", "microreg_name", "geom")) |>
st_write("microreg.geojson")

metro <- read_metro_area(year=2010) |>
select(-abbrev_state) |>
setNames(c("mun_id", "mun_name", "uf_code", "metro_name", "metro_type", "metro_legislation_date", "geom"))|>
st_write("metro.geojson")

mun_10 <- read_municipality(year=2010)
mun_20 <- read_municipality(year=2020)
mun <- read_municipality(year=2010) |>
select(-abbrev_state) |>
setNames(c("mun_id", "mun_name", "uf_code", "geom")) |>
st_write("mun.geojson")

wa <- read_weighting_area(year=2010) |>
select(-abbrev_state) |>
setNames(c("wa_id","mun_id", "mun_name", "uf_code", "reg_code", "reg_name","geom")) |>
st_write("wa.geojson")

# geo_levels_names --------------------------------------------------------
# Define a function to process each data frame
process_df <- function(df, select_cols = NULL) {
df <- df %>%
st_drop_geometry() %>%
as.data.frame() %>%
mutate(across(everything(), as.character))

if (!is.null(select_cols)) {
df <- df %>% select(all_of(select_cols))
}

return(df)
}

# Process each data frame
mun_names <- process_df(mun)
uf_names <- process_df(uf)
reg_names <- process_df(reg)
mesoreg_names <- process_df(mesoreg, c("mesoreg_id", "mesoreg_name"))
microreg_names <- process_df(microreg, c("microreg_id", "microreg_name"))
metro_names <- process_df(metro) %>% select(-mun_name)

# Join data frames
names0 <- full_join(ids, mun_names, by = c("uf_code", "mun_id"))
names <- full_join(uf_names, reg_names) %>%
full_join(names0) %>%
full_join(mesoreg_names) %>%
full_join(microreg_names) %>%
full_join(metro_names)

# Write to CSV
write.csv(names, "geo_levels_names_full.csv", row.names = FALSE, fileEncoding = "latin1")
# census tract ------------------------------------------------------------

# Create directories for census tract data
dir.create("tract_uf/", showWarnings = FALSE)

# Function to process and save census tract data for each state
process_census_tract <- function(state_code) {
current_state_data <- read_census_tract(
code_tract = state_code,
year = 2010,
showProgress = FALSE
) %>%
mutate(uf_code = substr(code_subdistrict, 1, 2)) %>%
select(code_state, code_muni, name_muni, code_tract) %>%
rename(uf_code = code_state,
tract_id = code_tract,
mun_id = code_muni,
mun_name = name_muni)

st_write(current_state_data,
paste0("01_data/01_raw/boundaries/tract_uf/", state_code, ".geojson"),
delete_dsn = TRUE)
}

# Process census tract data for each state
lapply(uf_codes, process_census_tract)

# Combine all state GeoJSON files into one
folder_path <- "tract_uf"
geojson_files <- list.files(path = folder_path, pattern = "\\.geojson$",
full.names = TRUE)
tract <- geojson_files %>%
lapply(st_read) %>%
bind_rows() %>%
select(-uf_code, -mun_name)

st_write(tract, "tract.geojson", delete_dsn = TRUE)
215 changes: 215 additions & 0 deletions scripts/bra_00_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@

# This code:
# > downloads 2010 Brazilian Census microdata from IBGE
# > unzip the microdata
# > reads .txt microdata into data frame
# > saves data sets as .csv files

# By, Rafael Pereira
# you can fund my contacts at https://sites.google.com/site/rafaelhenriquemoraespereira/
# 17-June-2016, Oxford, UK
# R version: RRO 3.2.2 (64 bits)


############################################################################
## ATTENTION: This is the only modification you have to do in this script, I hope ;)
rm(list=ls()) # Clear environment
setwd("01_data/01_raw") # set working Directory
############################################################################


##################### Load packages ----------------------------------------

# install devel version 1.9.7 of data.table
# GUIDELINES here : https://github.com/Rdatatable/data.table/wiki/Installation
library(data.table) # to manipulate data frames (fread and fwrite are ultrafast for reading and writing CSV files)
library(readr) #fast read of fixed witdh files
library(readxl) # read excel spreadsheets
options(scipen=999) # disable scientific notation
library(zip)
######## Download Census DATA -----------------------------------------------------

# create subdirectories where we'll save files
dir.create(file.path(".", "dados_txt2010"))
dir.create(file.path(".", "dados_csv2010"))


destfolder <- "./dados_txt2010/"
UFlist <- c("AC","AL","AM","AP","BA","CE","DF","ES","GO","MA","MG","MS","MT","PA","PB","PE","PI","PR","RJ","RN","RO","RR","RS","SC","SE","SP1","SP2_RM","TO")
ftppath <- "ftp://ftp.ibge.gov.br/Censos/Censo_Demografico_2010/Resultados_Gerais_da_Amostra/Microdados/"
tf <- tempfile()
td <- tempdir()

for (i in UFlist){
tf <- paste0(ftppath, i, ".zip")
td <- paste0("./dados_txt2010/", i, ".zip")
print(i)
download.file(tf, td, mode="wb")
}

# unzip all Files
filenames <- list.files("./dados_txt2010", pattern=".zip", full.names=TRUE)
lapply(filenames,unzip, exdir = "./dados_txt2010")

######## Download Census Documentation -----------------------------------------------------

file_url <- "ftp://ftp.ibge.gov.br/Censos/Censo_Demografico_2010/Resultados_Gerais_da_Amostra/Microdados/Documentacao.zip"
download.file(file_url,"Documentacao.zip", mode="wb")
zip::unzip("Documentacao.zip", exdir="documentacao2010", junkpaths=T)


######## Prepare Documentation files to read .txt -----------------------------------------------------

# Open variables layout from Excel file
dic_dom <- read_excel("./documentacao2010/Layout_microdados_Amostra.xls", sheet =1, skip = 1)
dic_pes <- read_excel("./documentacao2010/Layout_microdados_Amostra.xls", sheet =2, skip = 1)


# convert to data table
setDT(dic_dom)
setDT(dic_pes)


# compute width of each variable

# Create function to compute width
computeWidth <- function(dataset){dataset[is.na(DEC), DEC := 0] # Convert NA to 0
dataset[, width := INT + DEC] # create width variable
setnames(dataset,colnames(dataset)[which(colnames(dataset) == "POSIÇÃO INICIAL")],"pos.ini") # change name of variable initial position
setnames(dataset,colnames(dataset)[which(colnames(dataset) == "POSIÇÃO FINAL")],"pos.fin") # change name of variable final position
}

# Apply function
lapply(list(dic_dom,dic_pes), computeWidth)


### In case you need to work with a smaller subset of the data (e.g. because of memory limits),
### I would suggest you read from the .txt file only those variables you want
# myvariblesPES <- c("V0001", "V6400", "V0011", "V0300", "V0601", "V6036", "V0606", "V0010") # list of variables you want
# dic_pes <- dic_pes[VAR %in% myvariblesPES] # filter documentation, continue the code

### Alternatively, you could save the whole data set file as .csv and load only the variables you want
# individuals <- fread("pesBrasil.csv", select= myvariblesPES)


### HOUSEHOLD Files (13 minutes) ----------------------------------------------------
# readr and bind ALL .txt files one by one
# still quite fast
# memory intensive for big data sets
# Create the output directory if it does not exist
# Create the output directory if it does not exist
dir.create(file.path("dados_csv2010", "dom"), recursive = TRUE)


# List with all Household files
data_files <- list.files(path = "./dados_txt2010",
recursive = TRUE,
pattern = "Dom",
full.names = TRUE)

# Create function to read Household files
readDOM <- function(f) {
cat(f, "\n")
data <- read_fwf(f,
fwf_positions(dput(dic_dom[, pos.ini]),
dput(dic_dom[, pos.fin]),
col_names = dput(dic_dom[, VAR])),
progress = interactive())
return(data)
}

# Loop through each file and save individually by V0001 and V0002
for (f in data_files) {
temp <- readDOM(f)
setDT(temp)

# Update decimals in the data
var.decimals <- dic_dom[DEC > 0, ] # identify variables with decimals to update
var.decimals <- var.decimals[, .(VAR, DEC)]
var <- var.decimals$VAR # list of variables to update decimals

for (j in seq_along(var)) {
set(temp, j = var[j], value = as.numeric(temp[[var[j]]]) / 10^var.decimals[j, DEC])
}

# Save each subset of data by V0001 and V0002
unique_states <- unique(temp$V0001)
for (state in unique_states) {
state_data <- temp[V0001 == state]

# Create directory for the state if it doesn't exist
state_dir <- file.path("dados_csv2010", "dom", as.character(state))
if (!dir.exists(state_dir)) {
dir.create(state_dir, recursive = TRUE)
}

unique_municipalities <- unique(state_data$V0002)
for (municipality in unique_municipalities) {
municipality_data <- state_data[V0002 == municipality]
fwrite(municipality_data, file = file.path(state_dir, paste0(municipality, ".csv")))
}
}
}

rm(temp, readDOM); gc()

### INDIVIDUALS Files (41 minutes) ----------------------------------------------------
# readr each .txt files and save it appending to one single data set
# not as fast
# not memory intensive for big data sets
# Create the output directory if it does not exist
dir.create(file.path("dados_csv2010", "pes"), recursive = TRUE)

# List with all Household files
data_files <- list.files(path = "./dados_txt2010",
recursive = TRUE,
pattern = "Pes",
full.names = TRUE)

# Prepare documentation to update decimals in the data
var.decimals <- dic_pes[DEC > 0, ] # identify variables with decimals to update
var.decimals <- var.decimals[, .(VAR, DEC)]
var <- var.decimals$VAR # list of variables to update decimals


# Loop through each file and process
for (i in seq_along(data_files)) {

# Select state file
file <- data_files[i]

# Read data into temp
temp <- read_fwf(file,
fwf_positions(dput(dic_pes[, pos.ini]),
dput(dic_pes[, pos.fin]),
col_names = dput(dic_pes[, VAR])),
progress = interactive())
setDT(temp) # Set as Data Table

# Update decimals in the data
for (j in seq_along(var)) {
set(temp, j = var[j], value = as.numeric(temp[[var[j]]]) / 10^var.decimals[j, DEC])
}

# Save each subset of data by V0001 and V0002
unique_states <- unique(temp$V0001)
for (state in unique_states) {
state_data <- temp[V0001 == state]

# Create directory for the state if it doesn't exist
state_dir <- file.path("dados_csv2010", "pes", as.character(state))
if (!dir.exists(state_dir)) {
dir.create(state_dir, recursive = TRUE)
}

unique_municipalities <- unique(state_data$V0002)
for (municipality in unique_municipalities) {
municipality_data <- state_data[V0002 == municipality]
fwrite(municipality_data, file = file.path(state_dir, paste0(municipality, ".csv")))
}
}

cat("saving", i, "out of", length(data_files), file, "\n") # Update status of the loop
rm(temp); gc()
}