# Exercise 1: Core dplyr Operations - SOLUTION
# Module 3: Data Wrangling with dplyr
# ============================================================================

# Load required packages
library(dplyr)
library(data.table)
library(here)
library(lubridate)

# Load the VAT and CIT panels from Data/Intermediate (paths resolved by here()).
# The path is passed through `file =` so fread() always treats it as a file on
# disk and never as a shell command. Supplying a non-NULL `cmd` together with a
# positional input makes fread() stop with "Used more than one of the arguments
# input=, file=, text= and cmd=", so `cmd` is not passed at all.
panel_vat <- fread(file = here("Data", "Intermediate", "panel_vat.csv"))
panel_cit <- fread(file = here("Data", "Intermediate", "panel_cit.csv"))

# Coerce the declaration dates to base Date so both panels carry the same
# date class regardless of what fread() detected for the column.
panel_vat$declaration_date <- as.Date(panel_vat$declaration_date)
panel_cit$declaration_date <- as.Date(panel_cit$declaration_date)

# ============================================================================
# TASK 1: FILTERING
# ============================================================================

# Keep only the VAT declarations whose vat_outputs exceeds 30000.
high_vat <- filter(panel_vat, vat_outputs > 30000)

# Number of declarations that passed the filter.
nrow(high_vat)

# ============================================================================
# TASK 2: SELECTING
# ============================================================================

# Trim the VAT panel down to the three columns firm_id, declaration_date and
# vat_outputs (in that order).
vat_selected <- select(panel_vat, firm_id, declaration_date, vat_outputs)

# Preview the first rows of the trimmed table.
head(vat_selected)

# ============================================================================
# TASK 3: MUTATING
# ============================================================================

# Add two derived columns to the VAT panel:
#   net_vat   - vat_outputs minus vat_inputs
#   is_refund - TRUE where net_vat is negative, FALSE otherwise
#               (NA where net_vat is NA)
# The comparison already yields a logical vector, so it is used directly
# instead of being wrapped in if_else(..., TRUE, FALSE).
vat_with_flags <- panel_vat %>%
  mutate(
    net_vat = vat_outputs - vat_inputs,
    is_refund = net_vat < 0
  )

# Preview the first rows, including the two new columns.
head(vat_with_flags)

# ============================================================================
# TASK 4: ARRANGING
# ============================================================================

# Sort the CIT panel by tax_paid from largest to smallest and keep the first
# ten rows of the sorted table.
top_taxpayers <- head(arrange(panel_cit, desc(tax_paid)), n = 10)

# Display the ten rows.
top_taxpayers

# ============================================================================
# TASK 5: SUMMARIZING
# ============================================================================

# Collapse the CIT panel into a single row holding the total, mean and median
# of taxable_income. Missing values are ignored in all three statistics.
cit_summary <- summarise(
  panel_cit,
  total_income = sum(taxable_income, na.rm = TRUE),
  avg_income = mean(taxable_income, na.rm = TRUE),
  median_income = median(taxable_income, na.rm = TRUE)
)

# Display the one-row summary.
cit_summary

# ============================================================================
# TASK 6: GROUPING
# ============================================================================

# One row per calendar year of declaration_date, with the mean vat_outputs
# (missing values ignored) and the number of declarations filed in that year.
# The grouping column filing_year is computed directly inside group_by().
# year() is namespaced because data.table and lubridate both export a
# function of that name; lubridate's is the one the script resolves to.
vat_by_year <- panel_vat %>%
  group_by(filing_year = lubridate::year(declaration_date)) %>%
  summarise(
    avg_vat = mean(vat_outputs, na.rm = TRUE),
    num_declarations = n(),
    .groups = "drop"
  )

# Display the per-year summary.
vat_by_year

# ============================================================================
# TASK 7: CHALLENGE
# ============================================================================

# Three firms with the largest summed net VAT (vat_outputs - vat_inputs) over
# declarations dated in 2022:
#   1. keep the rows whose declaration_date falls in 2022,
#   2. sum net VAT per firm_id, ignoring missing values,
#   3. sort by that sum in descending order and keep the first three rows.
top_firms_2022 <- panel_vat %>%
  filter(lubridate::year(declaration_date) == 2022) %>%
  group_by(firm_id) %>%
  summarise(
    total_vat = sum(vat_outputs - vat_inputs, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(total_vat)) %>%
  head(3)

# Display the result.
top_firms_2022

# ============================================================================
# END OF EXERCISE 1 SOLUTION
# ============================================================================
