library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
library(readr)
library(readxl)
library(fs)
# library(here)
library(lubridate)
library(forcats)

DATA_DIR <- "data-raw"

pop <- list()

import_pre1980 <- function(file) {
  # cli::cat_bullet(file)
  year <- str_extract(file, "\\d{4}\\.csv")
  year <- as.integer(sub("\\.csv", "", year))
  skip <- if (year >= 1960) 8 else 9
  x <- read_csv(file, col_names = FALSE, skip = skip,
                col_types = cols_only(
                  X1 = col_guess(), X2 = col_guess(),
                  X3 = col_guess(), X4 = col_guess()

                ))
  colnames(x) <- c("Age", "Total", "Total Male", "Total Female")
  x <- x[1:max(which(x$Age %in% c("75+", "85+"))), ]
  stopifnot(nrow(x) %in% c(76L, 86L))
  x %>%
    mutate(
      Year = year,
      Age = sub("\\+", "", Age),
      Age = as.integer(Age)
    ) %>%
    select(Year, everything())
}

pop[["1900-1980"]] <- fs::dir_ls(
  here::here(DATA_DIR, "1900-1980")
) %>%
  map_dfr(import_pre1980)

import_1980_1990 <- function(file) {
  read_fwf(
    file,
    col_positions = fwf_positions(
      start = c(3, 5, 7, 12, 22, 32),
      end = c(4, 6, 9, 20, 30, 40),
      col_names = c("Month", "Year", "Age", "Total", "Total Male", "Total Female")
    ),
    skip = 0
  ) %>%
    filter(Age < 101, Month == 4, Year == min(Year)) %>%
    mutate(Year = Year + 1900)
}

pop[["1980-1990"]] <- fs::dir_ls(
  here::here(DATA_DIR, "1980-1990"),
  regexp = "TXT"
) %>%
  map_dfr(import_1980_1990)

import_1990_2000 <- function(file) {
  x <- read_csv(file, col_names = FALSE, skip = 3)
  colnames(x) <- c("Month", "Age", "Total", "Total Male", "Total Female")

  x %>%
    filter(
      !is.na(Total),
      !str_detect(Age, "All")
    ) %>%
    mutate(Month = lubridate::parse_date_time(Month, "mdy")) %>%
    filter(
      lubridate::month(Month) == 4
    ) %>%
    mutate(
      Year = lubridate::year(Month),
      Age = str_remove(Age, "\\+"),
      Age = as.integer(Age)
    ) %>%
    select(Year, everything(), -Month)
}

pop[["1990-2000"]] <- fs::dir_ls(
  here::here(DATA_DIR, "1990-2000"),
  regexp = "\\d{4}\\.csv"
) %>%
  map_dfr(import_1990_2000)

# pop[["2000-2010"]] <-  read_csv(
#   here::here(DATA_DIR, "2000-2010/us-est00int-alldata.csv")
# ) %>%
#   select(Month = MONTH, Year = YEAR, Age = AGE,
#          Total = TOT_POP, `Total Male` = TOT_MALE,
#          `Total Female` = TOT_FEMALE) %>%
#   filter(Year > 2000, Age <= 125)

# pop[["2000-2010"]] <-  readxl::read_xls(
#   here::here(DATA_DIR, "2000-2010/us-est00int-01.xls"),
#   sheet = "Sheet1"
# ) %>%
#   mutate(age_label = Age, Age = str_extract(Age, "\\d{1,3}"), Age = as.integer(Age) - str_detect(age_label, "Under") * 5) %>%
#   select(-age_label) %>%
#   gather(Year, value, -1:-2) %>%
#   spread(Group, value) %>%
#   mutate(Year = as.integer(Year)) %>%
#   filter(Year > 2000)

pop[["2000-2010"]] <-
  readr::read_csv(
    here::here(DATA_DIR, "2000-2010", "us-est00int-alldata-5yr.csv")
  ) %>%
    select(
      Year = year,
      Age = AGEGRP,
      Total = TOT_POP,
      `Total Male` = TOT_MALE,
      `Total Female` = TOT_FEMALE
    ) %>%
    filter(Age != 0, Year > 2000, Year < 2010) %>%
    mutate(Age = (Age - 1) * 5)

pop[["2010-2017"]] <- readr::read_csv(
  here::here(DATA_DIR, "2010-2017/PEP_2017_PEPSYASEXN_with_ann.csv"),
  skip = 1
) %>%
  select(-1:-4) %>%
  gather(key, value, -Sex) %>%
  filter(str_detect(key, "Population Estimate")) %>%
  mutate(key = str_remove(key, "Pop.+\\) - "), key = str_remove(key, " Total - ")) %>%
  filter(!str_detect(key, "Total|Median")) %>%
  separate(key, c("Year", "Age"), sep = ";") %>%
  mutate(
    Year = as.integer(Year),
    Age = sub("+", "", Age, fixed = TRUE),
    Age = as.integer(Age),
    Sex = recode(
      Sex,
      "Both Sexes" = "Total",
      "Female" = "Total Female",
      "Male" = "Total Male"
    )
  ) %>%
  filter(Year < 2016) %>%
  spread(Sex, value)

# pop[["2018"]] <- readr::read_tsv(
#   here::here(DATA_DIR, "national-population-projections_2014-2060.txt")
# ) %>%
#   select(-1) %>%
#   filter(!is.na(Year)) %>%
#   select(contains("Code"), contains("Pop")) %>%
#   {colnames(.) <- sub(" Code", "", colnames(.)); .} %>%
#   group_by(Year, Age) %>%
#   mutate(Total = sum(`Projected Populations`)) %>%
#   ungroup() %>%
#   spread(Gender, `Projected Populations`) %>%
#   mutate(
#     Age = sub("+", "", Age, fixed = TRUE),
#     Age = as.integer(Age)
#   ) %>%
#   filter(Year > 2017)

pop[["2018"]] <-
  readr::read_csv(
    here::here(DATA_DIR, "np2017_d1.csv")
  ) %>%
  filter(ORIGIN == 0, RACE == 0) %>%
  gather(Age, n, -SEX:-YEAR) %>%
  mutate(SEX = c("Total", "Total Male", "Total Female")[SEX + 1]) %>%
  spread(SEX, n) %>%
  filter(Age != "TOTAL_POP") %>%
  mutate(
    Age = sub("POP_", "", Age, fixed = TRUE),
    Age = as.integer(Age)
  ) %>%
  select(Year = YEAR, Age, starts_with("Total"))

fs::dir_create(here::here("data"))
saveRDS(pop, here::here("data", "pop_estimates_by_age.rds"))