# ==============================================================================
# Exercise 4.3 Solution: Diagnose and Fix a Broken Join
# Module: Data Reshaping and Joins
# ==============================================================================

library(tidyverse)
library(here)
library(data.table)

# Read the two exercise inputs (panel_vat.csv and dt_firms.csv) from the
# project's Intermediate data folder; paths are resolved with here().
vat_data <- fread(
  here("r_training_datax", "Exercises", "data", "Intermediate", "panel_vat.csv")
)
firm_data <- fread(
  here("r_training_datax", "Exercises", "data", "Intermediate", "dt_firms.csv")
)

# Naive join on firm_id alone -- the join this exercise sets out to diagnose.
# Any firm_id that occurs more than once in firm_data is matched repeatedly,
# so the corresponding VAT rows are duplicated in the result.
vat_enriched <- vat_data %>%
  left_join(firm_data, by = "firm_id")

# Print both row counts: a result larger than vat_data means the join fanned
# out.
nrow(vat_data)
nrow(vat_enriched)

# List every firm_id that appears more than once in the firm table. Each key
# printed here multiplies the matching VAT rows in a join on firm_id alone.
filter(count(firm_data, firm_id), n > 1)

# Run the same check on the VAT side, so it is visible whether repeated keys
# exist on one side of the join or on both.
filter(count(vat_data, firm_id), n > 1)

# Add the calendar year of each declaration; it serves as the second join key
# below. The explicit lubridate:: prefix avoids any ambiguity with the year()
# function that data.table also exports.
vat_data <- mutate(
  vat_data,
  year = lubridate::year(declaration_date)
)

# Corrected join: match on firm_id and year together, so that each VAT row is
# paired only with the firm_data row(s) carrying the same firm_id and year.
vat_enriched_fixed <- vat_data %>%
  left_join(firm_data, by = c("firm_id", "year"))

# Verify the fix. A left join keeps every row of vat_data, so the result has
# the same number of rows exactly when no VAT row matched more than one
# firm_data row.
nrow(vat_data)
nrow(vat_enriched_fixed)
nrow(vat_enriched_fixed) == nrow(vat_data)

# Enforce the check instead of only printing it: if rows are still being
# duplicated, halt here so the script does not go on to write a fanned-out
# table to disk.
if (nrow(vat_enriched_fixed) != nrow(vat_data)) {
  stop(
    "Join on firm_id + year changed the row count (",
    nrow(vat_data), " -> ", nrow(vat_enriched_fixed),
    "); firm_data is not unique on the join keys.",
    call. = FALSE
  )
}

# Count missing values in industry and size after the join. These are
# presumably the attributes brought in from firm_data (confirm against the
# data); an NA then means the VAT row found no firm_data match for its
# firm_id/year, or that firm_data itself holds a missing value.
vat_enriched_fixed %>%
  summarize(
    across(c(industry, size), ~ sum(is.na(.x)), .names = "na_{.col}")
  )

# Spot-check one firm: show its declarations in year order alongside the
# industry and size columns, to eyeball that each year carries one row.
vat_enriched_fixed %>%
  filter(firm_id == "FIRM_001") %>%
  arrange(year) %>%
  select(firm_id, year, declaration_date, industry, size)

# Save the enriched table as vat_enriched.csv in the same Intermediate data
# folder the inputs were read from.
fwrite(
  x = vat_enriched_fixed,
  file = here(
    "r_training_datax", "Exercises", "data", "Intermediate", "vat_enriched.csv"
  )
)
