You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

207 satır
7.6KB

  1. #! /usr/bin/env Rscript
  2. # Usage -------------------------------------------------------------------
  # The string literal that follows is the command-line interface text. It is
  # right-assigned to `doc` at its closing quote and later passed to docopt(),
  # so every character between the quotes is runtime text, not commentary.
  # It lists four sub-commands (search, timeline, update, simplify), then the
  # positional arguments <terms> and <fields>, then the options grouped by the
  # sub-command they belong to.
  # NOTE(review): the --since_id help text spells "greater" as "greather".
  # NOTE(review): this copy shows no blank lines or indentation inside the
  # literal; presumably they were lost in extraction -- confirm against the
  # canonical file before editing the text.
  3. 'Gather tweets from the command line
  4. Usage:
  5. gathertweet search [--file=<file>] [options] [--] <terms>...
  6. gathertweet timeline [options] [--] <users>...
  7. gathertweet update [--file=<file> --token=<token> --backup --backup-dir=<dir> --polite --debug-args]
  8. gathertweet simplify [--file=<file> --output=<output> --debug-args --polite] [<fields>...]
  9. Arguments
  10. <terms> Search terms. Individual search terms are queried separately,
  11. but duplicated tweets are removed from the stored results.
  12. Each search term counts against the 15 minute rate limit of 180
  13. searches, which can be avoided by manually joining search terms
  14. into a single query. WARNING: Wrap queries with spaces in
  15. \'single quotes\': double quotes are allowed inside single quotes only.
  16. <fields> Tweet fields that should be included. Default value will include
  17. `status_id`, `created_at`, `user_id`, `screen_name`, `text`,
  18. `favorite_count`, `retweet_count`, `is_quote`, `hashtags`,
  19. `mentions_screen_name`, `profile_url`, `profile_image_url`,
  20. `media_url`, `urls_url`, `urls_expanded_url`.
  21. Options:
  22. -h --help Show this screen.
  23. --file <file> Name of RDS file where tweets are stored [default: tweets.rds]
  24. --no-parse Disable parsing of the results
  25. --token <token> See {rtweet} for more information
  26. --retryonratelimit Wait and retry when rate limited (only relevant when n exceeds 18000 tweets)
  27. --quiet Disable printing of {rtweet} processing/retrieval messages
  28. --polite Only allow one process (search|update) to run at a time
  29. --backup Create a backup of existing tweet file before writing any new files
  30. --backup-dir <dir> Location for backups, use "" for current directory. [default: backups]
  31. --debug-args Print values of the arguments only
  32. --and-simplify Create additional simplified tweet set with default values.
  33. Run `gathertweet simplify` manually for more control.
  34. search and timeline:
  35. -n, --n <n> Number of tweets to return [default: 18000]
  36. --include_rts Logical indicating whether retweets should be included
  37. --max_id <max_id> Return results with an ID less than (older than) or equal to max_id
  38. search:
  39. --type <type> Type of search results: "recent", "mixed", or "popular". [default: recent]
  40. --geocode <geocode> Geographical limiter of the template "latitude,longitude,radius"
  41. --since_id <since_id> Return results with an ID greather than (newer than) or equal to since_id,
  42. automatically extracted from the existing tweets <file>, if it exists, and
  43. ignored when <max_id> is set. Use "none" for all available tweets,
  44. or "last" for the maximum seen status_id in existing tweets. [default: last]
  45. timeline:
  46. --home If included, returns home-timeline instead of user-timeline.
  47. simplify:
  48. --output <output> Output file, default is input file with `_simplified` appended to name.
  49. ' -> doc
  50. library(docopt)
  # Parse the command line against the usage text held in `doc`; the version
  # string is built from the installed gathertweet package version.
  args <- docopt(
    doc,
    version = paste("gathertweet version", packageVersion("gathertweet"))
  )
  # Terminate the R session without saving the workspace.
  # `value` is handed to quit() as the process exit status (default 0).
  exit <- function(value = 0) {
    quit(save = "no", status = value)
  }
  # --debug-args: print the parsed arguments, keep a copy in args.rds in the
  # working directory, and stop before any action runs.
  if (args[["--debug-args"]]) {
    str(args)
    saveRDS(object = args, file = "args.rds")
    exit()
  }
  58. library(gathertweet)
  # Join values into a single string.
  # The arguments in `...` are pasted element-wise with paste()'s default
  # separator, then the resulting pieces are joined using `sep` (", " by
  # default). Returns a length-one character vector.
  collapse <- function(..., sep = ", ") {
    paste(..., collapse = sep)
  }
  # Which action was called?
  # Keep the names of the known actions whose entry in `args` is TRUE.
  valid_actions <- c("search", "update", "simplify", "timeline")
  action <- valid_actions[vapply(args[valid_actions], isTRUE, logical(1))]

  if (length(action) == 0) {
    log_fatal("Please specify a valid action: {collapse(valid_actions)}")
  }
  # --polite: take an exclusive, non-blocking (timeout = 0) file lock. The lock
  # file name is derived from a digest of the target file and the search /
  # update / simplify flags. `lck` and `lockfile` are kept for the cleanup at
  # the end of the script.
  if (args[["polite"]]) {
    lockfile <- sprintf(
      ".gathertweet_%s.lock",
      digest::digest(args[c("file", "search", "update", "simplify")])
    )
    lck <- filelock::lock(lockfile, exclusive = TRUE, timeout = 0)
    gathertweet:::stopifnot_locked(
      lck,
      "Another gathertweet {action} process is currently running for {args$file}"
    )
  }

  log_info("---- gathertweet {action} start ----")
  # Search ------------------------------------------------------------------
  # Runs each search term as its own query, merges the results, drops
  # duplicated status ids, and stores the tweets in args$file.
  if (isTRUE(args$search)) {
    if (args[["--and-simplify"]]) args$simplify <- TRUE

    log_info("Searching for \"{paste0(args$terms, collapse = '\", \"')}\"")

    max_id <- args[["max_id"]]
    since_id <- args[["since_id"]]
    # since_id is only used when max_id is absent (the `if` without an `else`
    # yields NULL otherwise): "last" resumes from the newest stored tweet,
    # "none" disables the lower bound, anything else is used as given.
    since_id <- if (is.null(max_id)) {
      if (since_id == "last") {
        last_seen_tweet(file = args$file)
      } else if (since_id == "none") {
        NULL
      } else {
        since_id
      }
    }

    if (!is.null(since_id)) log_info("Tweets from {since_id}")
    if (!is.null(max_id)) log_info("Tweets up to {max_id}")

    # Use the exact element name `terms`; `args$term` only resolved through
    # partial matching of `$` on lists.
    tweets <- lapply(
      args$terms,
      function(term) rtweet::search_tweets(
        q = term,
        n = as.integer(args$n),
        type = args$type,
        include_rts = args$include_rts,
        geocode = args$geocode,
        max_id = max_id,
        parse = !args[["no-parse"]],
        token = args$token,
        retryonratelimit = args$retryonratelimit,
        verbose = !args$quiet,
        since_id = since_id
      )
    )
    tweets <- dplyr::bind_rows(tweets)

    if (nrow(tweets) == 0) {
      log_info("No new tweets.")
      # Leaving early: release the --polite lock here, because the cleanup at
      # the end of the script is never reached after exit().
      if (args$polite) {
        filelock::unlock(lck)
        unlink(lockfile)
      }
      exit()
    }

    tweets <- tweets[!duplicated(tweets$status_id), ]
    tweets <- tweets[order(tweets$status_id), ]
    log_info("Gathered {nrow(tweets)} tweets")

    if (args$backup) backup_tweets(args$file, backup_dir = args[["backup-dir"]])
    tweets <- save_tweets(tweets, args$file)
    log_info("Total of {nrow(tweets)} tweets in {args$file}")

  # Update ------------------------------------------------------------------
  # Refreshes the tweets already stored in args$file and writes them back.
  } else if (isTRUE(args$update)) {
    logger("Updating tweets in {args$file}")
    tweets <- update_tweets(
      file = args$file,
      # passed to rtweet::lookup_statuses()
      parse = !args[["no-parse"]],
      token = args$token
    )
    log_debug("Status lookup returned {nrow(tweets)} tweets")

    if (args$backup) backup_tweets(args$file, backup_dir = args[["backup-dir"]])
    tweets <- save_tweets(tweets, args$file)
    log_debug("Total of {nrow(tweets)} tweets in {args$file}")

  # Timeline ----------------------------------------------------------------
  # Fetches the timelines of the requested users and stores them in args$file.
  } else if (isTRUE(args$timeline)) {
    if (!length(args$users)) {
      stop("Please provide a list of users as user names, user IDs, or a mixture of both.")
    }
    log_info("Gathering tweets by {collapse(args$users)}")
    if (args[["--and-simplify"]]) args$simplify <- TRUE

    tweets <- rtweet::get_timeline(
      user = args[["users"]],
      # the request size is capped at 3200 here
      n = min(as.integer(args[["n"]]), 3200),
      max_id = args[["max_id"]],
      home = isTRUE(args[["home"]]),
      parse = isFALSE(args[["no-parse"]]),
      check = TRUE,
      token = args$token,
      # The option is declared as --include_rts (underscore). The previous
      # lookup of "include-rts" was always NULL, so retweets were never
      # included regardless of the flag.
      include_rts = isTRUE(args[["include_rts"]])
    )

    tweets <- tweets[!duplicated(tweets$status_id), ]
    tweets <- tweets[order(tweets$status_id), ]
    log_info("Gathered {nrow(tweets)} tweets from {length(args$users)} users")

    if (args$backup) backup_tweets(args$file, backup_dir = args[["backup-dir"]])
    tweets <- save_tweets(tweets, args$file)
    log_info("Total of {nrow(tweets)} tweets in {args$file}")
  }
  # Simplify ----------------------------------------------------------------
  # Builds a reduced tweet set from args$file, keeping the requested fields,
  # and saves it to args$output. Runs for the `simplify` action and also after
  # search/timeline when --and-simplify switched args$simplify on.
  if (isTRUE(args$simplify)) {
    logger("Simplifying tweets in {args$file}")
    tweets_simplified <- simplify_tweets(
      tweets = NULL,
      file = args$file,
      .fields = args$fields
    )
    log_debug("Simplified {nrow(tweets_simplified)} tweets")

    # Without --output, derive the name from the input file by appending
    # "_simplified".
    if (is.null(args$output)) {
      args$output <- gathertweet:::path_add(args$file, append = "_simplified")
    }

    log_info("Saving simplified tweets to {args$output}")
    # The assignment target was misspelled (`tweets_simplfied`), which created
    # a stray variable instead of updating `tweets_simplified`.
    tweets_simplified <- save_tweets(tweets_simplified, args$output)
  }
  # Release the --polite lock taken at start-up and delete its lock file.
  if (args[["polite"]]) {
    filelock::unlock(lck)
    unlink(lockfile)
  }

  log_info("---- gathertweet {action} complete ----")