You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

127 line
4.7KB

  #! /usr/bin/env Rscript

  # Usage -------------------------------------------------------------------
  # Command-line interface definition. docopt builds the argument parser from
  # this help text, so its layout is significant: sections are separated by
  # blank lines, usage patterns are indented under "Usage:", and each option
  # is separated from its description by at least two spaces.
  library(docopt)

  doc <- 'Gather tweets from the command line

  Usage:
    gathertweet search [--file=<file>] [options] [--] <terms>...
    gathertweet update [--file=<file> --token=<token> --backup --backup-dir=<dir> --polite --debug-args]

  Arguments
    <terms>  Search terms. Individual search terms are queried separately,
             but duplicated tweets are removed from the stored results.
             Each search term counts against the 15 minute rate limit of 180
             searches, which can be avoided by manually joining search terms
             into a single query. WARNING: Wrap queries with spaces in
             \'single quotes\': double quotes are allowed inside single quotes only.

  Options:
    -h --help              Show this screen.
    --file=<file>          Name of RDS file where tweets are stored [default: tweets.rds]
    -n, --n <n>            Number of tweets to return [default: 18000]
    --type <type>          Type of search results: "recent", "mixed", or "popular". [default: recent]
    --include_rts          Logical indicating whether retweets should be included
    --geocode <geocode>    Geographical limiter of the template "latitude,longitude,radius"
    --max_id <max_id>      Return results with an ID less than (older than) or equal to max_id
    --since_id <since_id>  Return results with an ID greater than (newer than) or equal to since_id,
                           automatically extracted from the existing tweets <file>, if it exists, and
                           ignored when <max_id> is set. [default: last]
    --no-parse             Disable parsing of the results
    --token <token>        See {rtweet} for more information
    --retryonratelimit     Wait and retry when rate limited (only relevant when n exceeds 18000 tweets)
    --quiet                Disable printing of {rtweet} processing/retrieval messages
    --polite               Only allow one process (search|update) to run at a time
    --backup               Create a backup of existing tweet file before writing any new files
    --backup-dir <dir>     Location for backups, use "" for current directory. [default: backups]
    --debug-args           Print values of the arguments only
'

  # Parse the command line against the usage text above. The version string
  # is built from the installed gathertweet package version.
  args <- docopt(
    doc,
    version = paste("gathertweet version", packageVersion("gathertweet"))
  )

  # Quit the R session without saving the workspace.
  # `value` is passed to q() as the process exit status (0 by default).
  exit <- function(value = 0) q(save = "no", status = value)

  # --debug-args: print the parsed arguments, also write them to "args.rds"
  # in the working directory, and stop before any tweets are touched.
  if (isTRUE(args[["--debug-args"]])) {
    str(args)
    saveRDS(args, "args.rds")
    exit()
  }
  # Setup -------------------------------------------------------------------
  library(gathertweet)

  # Name of whichever sub-command flag (search / update) is TRUE in the parsed
  # arguments; interpolated into the log and lock messages below.
  subcommands <- args[c("search", "update")]
  action <- names(subcommands)[vapply(subcommands, isTRUE, logical(1))]

  # Polite mode: take an exclusive lock on a file whose name is derived from a
  # digest of the target file and the sub-command flags. timeout = 0 is passed
  # so the lock attempt does not wait; stopifnot_locked() is then given the
  # lock result together with the message to report.
  if (args$polite) {
    lock_key <- digest::digest(args[c("file", "search", "update")])
    lockfile <- paste0(".gathertweet_", lock_key, ".lock")
    lck <- filelock::lock(lockfile, exclusive = TRUE, timeout = 0)
    gathertweet:::stopifnot_locked(
      lck,
      "Another gathertweet {action} process is currently running for {args$file}"
    )
  }

  log_info("---- gathertweet {action} start ----")
  # Search ------------------------------------------------------------------
  # Query each search term, de-duplicate the results by status_id and merge
  # them into the tweet file.
  if (isTRUE(args$search)) {
    log_info("Searching for \"{paste0(args$terms, collapse = '\", \"')}\"")

    max_id <- args[["max_id"]]
    since_id <- args[["since_id"]]
    # --since_id is dropped (NULL) whenever --max_id is supplied. Otherwise the
    # sentinel "last" is resolved with last_seen_tweet() on the tweet file and
    # any other value is passed through unchanged. identical() keeps the
    # comparison safe if since_id is ever NULL or not a single string.
    since_id <- if (!is.null(max_id)) {
      NULL
    } else if (identical(since_id, "last")) {
      last_seen_tweet(file = args$file)
    } else {
      since_id
    }
    if (!is.null(since_id)) log_info("Tweets from {since_id}")
    if (!is.null(max_id)) log_info("Tweets up to {max_id}")

    tweets <- rtweet::search_tweets2(
      q = args$terms,
      n = as.integer(args$n),
      type = args$type,
      include_rts = args$include_rts,
      geocode = args$geocode,
      max_id = max_id,
      parse = !args[["no-parse"]],
      token = args$token,
      retryonratelimit = args$retryonratelimit,
      verbose = !args$quiet,
      since_id = since_id
    )

    if (nrow(tweets) == 0) {
      log_info("No new tweets.")
      # This early exit bypasses the cleanup at the end of the script, so the
      # polite-mode lock is released and its file removed here; otherwise the
      # .lock file would be left behind on disk.
      if (args$polite) {
        filelock::unlock(lck)
        unlink(lockfile)
      }
      exit()
    }

    # Sort by status_id, then keep the first row for each status_id: separate
    # search terms can return the same tweet.
    tweets <- tweets[order(tweets$status_id), ]
    tweets <- tweets[!duplicated(tweets$status_id), ]
    log_info("Gathered {nrow(tweets)} tweets")

    if (args$backup) backup_tweets(args$file, backup_dir = args[["backup-dir"]])
    tweets <- save_tweets(tweets, args$file)
    log_info("Total of {nrow(tweets)} tweets in {args$file}")

  } else if (isTRUE(args$update)) {
    # Update ----------------------------------------------------------------
    # Refresh the tweets already stored in the file and write them back.
    # NOTE(review): the original called logger() here, which nothing visible
    # in this script defines; log_info() matches every other message in the
    # script. Confirm logger() was not intentionally different.
    log_info("Updating tweets in {args$file}")
    tweets <- update_tweets(
      file = args$file,
      # passed to rtweet::lookup_statuses()
      parse = !args[["no-parse"]],
      token = args$token
    )
    log_debug("Status lookup returned {nrow(tweets)} tweets")

    if (args$backup) backup_tweets(args$file, backup_dir = args[["backup-dir"]])
    tweets <- save_tweets(tweets, args$file)
    log_debug("Total of {nrow(tweets)} tweets in {args$file}")
  }
  # Cleanup -----------------------------------------------------------------
  # Polite mode only: release the lock held in `lck`, then delete the lock
  # file so nothing is left behind in the working directory.
  polite_mode <- args$polite
  if (polite_mode) {
    filelock::unlock(lck)
    unlink(lockfile)
  }

  log_info("---- gathertweet {action} complete ----")