You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

119 lines
4.2KB

  1. #! /usr/bin/env Rscript
  2. # Usage -------------------------------------------------------------------
  3. 'Gather tweets from the command line
  4. Usage:
  5. gathertweet search [--file=<file>] [options] [--] <terms>...
  6. gathertweet update [--file=<file> --backup --polite --token --debug-args]
  7. Arguments
  8. <terms> Search terms. Individual search terms are queried separately,
  9. but duplicated tweets are removed from the stored results.
  10. Options:
  11. -h --help Show this screen.
  12. --file=<file> Name of RDS file where tweets are stored [default: tweets.rds]
  13. -n, --n <n> Number of tweets to return [default: 18000]
  14. --type <type> Type of search results: "recent", "mixed", or "popular". [default: recent]
  15. --include_rts Logical indicating whether retweets should be included
  16. --geocode <geocode> Geographical limiter of the template "latitude,longitude,radius"
  17. --max_id <max_id> Return results with an ID less than (older than) or equal to max_id
  18. --since_id <since_id> Return results with an ID greather than (newer than) or equal to since_id,
  19. automatically extracted from the existing tweets <file>, if it exists, and
  20. ignored when <max_id> is set. [default: last]
  21. --no-parse Disable parsing of the results
  22. --token <token> See {rtweet} for more information
  23. --retryonratelimit Wait and retry when rate limited (only relevant when n exceeds 18000 tweets)
  24. --quiet Disable printing of {rtweet} processing/retrieval messages
  25. --polite Only allow one process (search|update) to run at a time
  26. --backup Create a backup of existing tweet file before writing any new files
  27. --debug-args Print values of the arguments only
  28. ' -> doc
  29. library(docopt)
  30. args <- docopt(doc, version = paste('gathertweet version', packageVersion("gathertweet")))
  31. exit <- function(value = 0) q(save = "no", value)
  32. if (args$`--debug-args`) {
  33. str(args)
  34. saveRDS(args, "args.rds")
  35. exit()
  36. }
  37. library(gathertweet)
  38. if (args$polite) {
  39. lockfile <- paste0(".gathertweet_",
  40. digest::digest(args[c("file", "search", "update")]),
  41. ".lock")
  42. lck <- filelock::lock(lockfile, exclusive = TRUE, timeout = 0)
  43. action <- names(Filter(isTRUE, args[c("search", "update")]))
  44. gathertweet:::stopifnot_locked(lck, "Another gathertweet {action} process is currently running for {args$file}")
  45. }
  46. # Search ------------------------------------------------------------------
  47. if (isTRUE(args$search)) {
  48. log_info("Searching for \"{paste0(args$terms, collapse = '\", \"')}\"")
  49. max_id <- args[["max_id"]]
  50. since_id <- args[["since_id"]]
  51. since_id <- if (is.null(max_id)) {
  52. if (since_id == "last") {
  53. last_seen_tweet(file = args$file)
  54. } else since_id
  55. }
  56. if (!is.null(since_id)) log_info("Tweets from {since_id}")
  57. if (!is.null(max_id)) log_info("Tweets up to {max_id}")
  58. tweets <- rtweet::search_tweets2(
  59. q = args$terms,
  60. n = as.integer(args$n),
  61. type = args$type,
  62. include_rts = args$include_rts,
  63. geocode = args$geocode,
  64. max_id = max_id,
  65. parse = !args[["no-parse"]],
  66. token = args$token,
  67. retryonratelimit = args$retryonratelimit,
  68. verbose = !args$quiet,
  69. since_id = since_id
  70. )
  71. if (nrow(tweets) == 0) {
  72. log_info("No new tweets.")
  73. exit()
  74. }
  75. tweets <- tweets[order(tweets$status_id), ]
  76. tweets <- tweets[!duplicated(tweets$status_id), ]
  77. log_info("Gathered {nrow(tweets)} tweets")
  78. if (args$backup) backup_tweets(args$file)
  79. tweets <- save_tweets(tweets, args$file)
  80. log_info("Total of {nrow(tweets)} tweets in {args$file}")
  81. # Update ------------------------------------------------------------------
  82. } else if (isTRUE(args$update)) {
  83. logger("Updating tweets in {args$file}")
  84. tweets <- update_tweets(
  85. file = args$file,
  86. # passed to rtweet::lookup_statuses()
  87. parse = !args[["no-parse"]],
  88. token = args$token
  89. )
  90. log_debug("Status lookup returned {nrow(tweets)} tweets")
  91. if (args$backup) backup_tweets(args$file)
  92. tweets <- save_tweets(tweets, args$file)
  93. log_debug("Total of {nrow(tweets)} tweets in {args$file}")
  94. log_info("Tweet update complete")
  95. }
  96. if (args$polite) {
  97. filelock::unlock(lck)
  98. unlink(lockfile)
  99. }