You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

182 line
4.9KB

  1. library(dplyr)
  2. library(purrr)
  3. library(tidyr)
  4. library(stringr)
  5. library(readr)
  6. library(readxl)
  7. library(fs)
  8. # library(here)
  9. library(lubridate)
  10. library(forcats)
  11. DATA_DIR <- "data-raw"
  12. pop <- list()
  13. import_pre1980 <- function(file) {
  14. # cli::cat_bullet(file)
  15. year <- str_extract(file, "\\d{4}\\.csv")
  16. year <- as.integer(sub("\\.csv", "", year))
  17. skip <- if (year >= 1960) 8 else 9
  18. x <- read_csv(file, col_names = FALSE, skip = skip,
  19. col_types = cols_only(
  20. X1 = col_guess(), X2 = col_guess(),
  21. X3 = col_guess(), X4 = col_guess()
  22. ))
  23. colnames(x) <- c("Age", "Total", "Total Male", "Total Female")
  24. x <- x[1:max(which(x$Age %in% c("75+", "85+"))), ]
  25. stopifnot(nrow(x) %in% c(76L, 86L))
  26. x %>%
  27. mutate(
  28. Year = year,
  29. Age = sub("\\+", "", Age),
  30. Age = as.integer(Age)
  31. ) %>%
  32. select(Year, everything())
  33. }
  34. pop[["1900-1980"]] <- fs::dir_ls(
  35. here::here(DATA_DIR, "1900-1980")
  36. ) %>%
  37. map_dfr(import_pre1980)
  38. import_1980_1990 <- function(file) {
  39. read_fwf(
  40. file,
  41. col_positions = fwf_positions(
  42. start = c(3, 5, 7, 12, 22, 32),
  43. end = c(4, 6, 9, 20, 30, 40),
  44. col_names = c("Month", "Year", "Age", "Total", "Total Male", "Total Female")
  45. ),
  46. skip = 0
  47. ) %>%
  48. filter(Age < 101, Month == 4, Year == min(Year)) %>%
  49. mutate(Year = Year + 1900)
  50. }
  51. pop[["1980-1990"]] <- fs::dir_ls(
  52. here::here(DATA_DIR, "1980-1990"),
  53. regexp = "TXT"
  54. ) %>%
  55. map_dfr(import_1980_1990)
  56. import_1990_2000 <- function(file) {
  57. x <- read_csv(file, col_names = FALSE, skip = 3)
  58. colnames(x) <- c("Month", "Age", "Total", "Total Male", "Total Female")
  59. x %>%
  60. filter(
  61. !is.na(Total),
  62. !str_detect(Age, "All")
  63. ) %>%
  64. mutate(Month = lubridate::parse_date_time(Month, "mdy")) %>%
  65. filter(
  66. lubridate::month(Month) == 4
  67. ) %>%
  68. mutate(
  69. Year = lubridate::year(Month),
  70. Age = str_remove(Age, "\\+"),
  71. Age = as.integer(Age)
  72. ) %>%
  73. select(Year, everything(), -Month)
  74. }
  75. pop[["1990-2000"]] <- fs::dir_ls(
  76. here::here(DATA_DIR, "1990-2000"),
  77. regexp = "\\d{4}\\.csv"
  78. ) %>%
  79. map_dfr(import_1990_2000)
  80. # pop[["2000-2010"]] <- read_csv(
  81. # here::here(DATA_DIR, "2000-2010/us-est00int-alldata.csv")
  82. # ) %>%
  83. # select(Month = MONTH, Year = YEAR, Age = AGE,
  84. # Total = TOT_POP, `Total Male` = TOT_MALE,
  85. # `Total Female` = TOT_FEMALE) %>%
  86. # filter(Year > 2000, Age <= 125)
  87. # pop[["2000-2010"]] <- readxl::read_xls(
  88. # here::here(DATA_DIR, "2000-2010/us-est00int-01.xls"),
  89. # sheet = "Sheet1"
  90. # ) %>%
  91. # mutate(age_label = Age, Age = str_extract(Age, "\\d{1,3}"), Age = as.integer(Age) - str_detect(age_label, "Under") * 5) %>%
  92. # select(-age_label) %>%
  93. # gather(Year, value, -1:-2) %>%
  94. # spread(Group, value) %>%
  95. # mutate(Year = as.integer(Year)) %>%
  96. # filter(Year > 2000)
  97. pop[["2000-2010"]] <-
  98. readr::read_csv(
  99. here::here(DATA_DIR, "2000-2010", "us-est00int-alldata-5yr.csv")
  100. ) %>%
  101. select(
  102. Year = year,
  103. Age = AGEGRP,
  104. Total = TOT_POP,
  105. `Total Male` = TOT_MALE,
  106. `Total Female` = TOT_FEMALE
  107. ) %>%
  108. filter(Age != 0, Year > 2000, Year < 2010) %>%
  109. mutate(Age = (Age - 1) * 5)
  110. pop[["2010-2017"]] <- readr::read_csv(
  111. here::here(DATA_DIR, "2010-2017/PEP_2017_PEPSYASEXN_with_ann.csv"),
  112. skip = 1
  113. ) %>%
  114. select(-1:-4) %>%
  115. gather(key, value, -Sex) %>%
  116. filter(str_detect(key, "Population Estimate")) %>%
  117. mutate(key = str_remove(key, "Pop.+\\) - "), key = str_remove(key, " Total - ")) %>%
  118. filter(!str_detect(key, "Total|Median")) %>%
  119. separate(key, c("Year", "Age"), sep = ";") %>%
  120. mutate(
  121. Year = as.integer(Year),
  122. Age = sub("+", "", Age, fixed = TRUE),
  123. Age = as.integer(Age),
  124. Sex = recode(
  125. Sex,
  126. "Both Sexes" = "Total",
  127. "Female" = "Total Female",
  128. "Male" = "Total Male"
  129. )
  130. ) %>%
  131. filter(Year < 2016) %>%
  132. spread(Sex, value)
  133. # pop[["2018"]] <- readr::read_tsv(
  134. # here::here(DATA_DIR, "national-population-projections_2014-2060.txt")
  135. # ) %>%
  136. # select(-1) %>%
  137. # filter(!is.na(Year)) %>%
  138. # select(contains("Code"), contains("Pop")) %>%
  139. # {colnames(.) <- sub(" Code", "", colnames(.)); .} %>%
  140. # group_by(Year, Age) %>%
  141. # mutate(Total = sum(`Projected Populations`)) %>%
  142. # ungroup() %>%
  143. # spread(Gender, `Projected Populations`) %>%
  144. # mutate(
  145. # Age = sub("+", "", Age, fixed = TRUE),
  146. # Age = as.integer(Age)
  147. # ) %>%
  148. # filter(Year > 2017)
  149. pop[["2018"]] <-
  150. readr::read_csv(
  151. here::here(DATA_DIR, "np2017_d1.csv")
  152. ) %>%
  153. filter(ORIGIN == 0, RACE == 0) %>%
  154. gather(Age, n, -SEX:-YEAR) %>%
  155. mutate(SEX = c("Total", "Total Male", "Total Female")[SEX + 1]) %>%
  156. spread(SEX, n) %>%
  157. filter(Age != "TOTAL_POP") %>%
  158. mutate(
  159. Age = sub("POP_", "", Age, fixed = TRUE),
  160. Age = as.integer(Age)
  161. ) %>%
  162. select(Year = YEAR, Age, starts_with("Total"))
  163. fs::dir_create(here::here("data"))
  164. saveRDS(pop, here::here("data", "pop_estimates_by_age.rds"))