Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

124 linhas
4.7KB

  1. #! /usr/bin/env Rscript
  2. # Usage -------------------------------------------------------------------
  3. 'Gather tweets from the command line
  4. Usage:
  5. gathertweet search [--file=<file>] [options] [--] <terms>...
  6. gathertweet update [--file=<file> --token=<token> --backup --polite --debug-args]
  7. Arguments
  8. <terms> Search terms. Individual search terms are queried separately,
  9. but duplicated tweets are removed from the stored results.
  10. Each search term counts against the 15 minute rate limit of 180
  11. searches, which can be avoided by manually joining search terms
  12. into a single query. WARNING: Wrap queries with spaces in
  13. \'single quotes\': double quotes are allowed inside single quotes only.
  14. Options:
  15. -h --help Show this screen.
  16. --file=<file> Name of RDS file where tweets are stored [default: tweets.rds]
  17. -n, --n <n> Number of tweets to return [default: 18000]
  18. --type <type> Type of search results: "recent", "mixed", or "popular". [default: recent]
  19. --include_rts Logical indicating whether retweets should be included
  20. --geocode <geocode> Geographical limiter of the template "latitude,longitude,radius"
  21. --max_id <max_id> Return results with an ID less than (older than) or equal to max_id
  22. --since_id <since_id> Return results with an ID greather than (newer than) or equal to since_id,
  23. automatically extracted from the existing tweets <file>, if it exists, and
  24. ignored when <max_id> is set. [default: last]
  25. --no-parse Disable parsing of the results
  26. --token <token> See {rtweet} for more information
  27. --retryonratelimit Wait and retry when rate limited (only relevant when n exceeds 18000 tweets)
  28. --quiet Disable printing of {rtweet} processing/retrieval messages
  29. --polite Only allow one process (search|update) to run at a time
  30. --backup Create a backup of existing tweet file before writing any new files
  31. --backup-dir <backup_dir> Location for backups, use "" for current directory. [default: backups]
  32. --debug-args Print values of the arguments only
  33. ' -> doc
  34. library(docopt)
  35. args <- docopt(doc, version = paste('gathertweet version', packageVersion("gathertweet")))
  36. exit <- function(value = 0) q(save = "no", value)
  37. if (args$`--debug-args`) {
  38. str(args)
  39. saveRDS(args, "args.rds")
  40. exit()
  41. }
  42. library(gathertweet)
  43. if (args$polite) {
  44. lockfile <- paste0(".gathertweet_",
  45. digest::digest(args[c("file", "search", "update")]),
  46. ".lock")
  47. lck <- filelock::lock(lockfile, exclusive = TRUE, timeout = 0)
  48. action <- names(Filter(isTRUE, args[c("search", "update")]))
  49. gathertweet:::stopifnot_locked(lck, "Another gathertweet {action} process is currently running for {args$file}")
  50. }
  51. # Search ------------------------------------------------------------------
  52. if (isTRUE(args$search)) {
  53. log_info("Searching for \"{paste0(args$terms, collapse = '\", \"')}\"")
  54. max_id <- args[["max_id"]]
  55. since_id <- args[["since_id"]]
  56. since_id <- if (is.null(max_id)) {
  57. if (since_id == "last") {
  58. last_seen_tweet(file = args$file)
  59. } else since_id
  60. }
  61. if (!is.null(since_id)) log_info("Tweets from {since_id}")
  62. if (!is.null(max_id)) log_info("Tweets up to {max_id}")
  63. tweets <- rtweet::search_tweets2(
  64. q = args$terms,
  65. n = as.integer(args$n),
  66. type = args$type,
  67. include_rts = args$include_rts,
  68. geocode = args$geocode,
  69. max_id = max_id,
  70. parse = !args[["no-parse"]],
  71. token = args$token,
  72. retryonratelimit = args$retryonratelimit,
  73. verbose = !args$quiet,
  74. since_id = since_id
  75. )
  76. if (nrow(tweets) == 0) {
  77. log_info("No new tweets.")
  78. exit()
  79. }
  80. tweets <- tweets[order(tweets$status_id), ]
  81. tweets <- tweets[!duplicated(tweets$status_id), ]
  82. log_info("Gathered {nrow(tweets)} tweets")
  83. if (args$backup) backup_tweets(args$file, backup_dir = args$backup_dir)
  84. tweets <- save_tweets(tweets, args$file)
  85. log_info("Total of {nrow(tweets)} tweets in {args$file}")
  86. # Update ------------------------------------------------------------------
  87. } else if (isTRUE(args$update)) {
  88. logger("Updating tweets in {args$file}")
  89. tweets <- update_tweets(
  90. file = args$file,
  91. # passed to rtweet::lookup_statuses()
  92. parse = !args[["no-parse"]],
  93. token = args$token
  94. )
  95. log_debug("Status lookup returned {nrow(tweets)} tweets")
  96. if (args$backup) backup_tweets(args$file, backup_dir = args$backup_dir)
  97. tweets <- save_tweets(tweets, args$file)
  98. log_debug("Total of {nrow(tweets)} tweets in {args$file}")
  99. log_info("Tweet update complete")
  100. }
  101. if (args$polite) {
  102. filelock::unlock(lck)
  103. unlink(lockfile)
  104. }