Explorar el Código

Add gathertweet simplify

simplify
Garrick Aden-Buie hace 7 años
padre
commit
6901f2e7d6
Se han modificado 4 ficheros con 76 adiciones y 10 borrados
  1. +1
    -0
      .Rbuildignore
  2. +1
    -0
      NAMESPACE
  3. +35
    -0
      R/tweet_io.R
  4. +39
    -10
      inst/gathertweet.R

+ 1
- 0
.Rbuildignore Ver fichero

^R/initial_functions\.R$
^LICENSE\.md$ ^LICENSE\.md$
^gathertweet\.Rproj$ ^gathertweet\.Rproj$
^\.Rproj\.user$ ^\.Rproj\.user$

+ 1
- 0
NAMESPACE Ver fichero

export(logger) export(logger)
export(read_tweets) export(read_tweets)
export(save_tweets) export(save_tweets)
export(simplify_tweets)
export(update_tweets) export(update_tweets)
import(filelock) import(filelock)
importFrom(dplyr,anti_join) importFrom(dplyr,anti_join)

+ 35
- 0
R/tweet_io.R Ver fichero

fs::file_copy(file, file_backup) fs::file_copy(file, file_backup)
} }


#' Reduce a set of tweets to selected columns
#'
#' @param tweets Tweets to simplify. When `NULL`, they are read with
#'   `read_tweets(file)`.
#' @param file File passed to `read_tweets()` when `tweets` is `NULL`. Defaults
#'   to the `gathertweet.file` option, or `"tweets.rds"` if the option is unset.
#' @param ... Column selections; evaluated and spliced into `dplyr::select()`.
#' @param .fields Further column selections, appended after those in `...`.
#' @return `NULL` if `tweets` is still `NULL` after attempting to read `file`;
#'   otherwise `tweets` restricted to the requested columns, or to a default
#'   set of columns when neither `...` nor `.fields` selects anything.
#' @export
simplify_tweets <- function(
  tweets = NULL,
  file = getOption("gathertweet.file", "tweets.rds"),
  ...,
  .fields = NULL
) {
  if (is.null(tweets)) {
    tweets <- read_tweets(file)
    # Nothing supplied and nothing could be read: there is nothing to simplify.
    if (is.null(tweets)) {
      return(NULL)
    }
  }

  # Selections given through `...` come first, followed by `.fields`.
  selection <- c(list(...), .fields)
  if (length(selection) > 0) {
    return(dplyr::select(tweets, !!!selection))
  }

  # No explicit selection: fall back to the default column set.
  dplyr::select(
    tweets,
    created_at, status_id, user_id, screen_name, text,
    favorite_count, retweet_count, hashtags,
    profile_url, profile_image_url,
    urls_expanded_url, mentions_screen_name,
    is_quote, media_url, urls_url
  )
}

#' @export #' @export
update_tweets <- function( update_tweets <- function(
tweets = NULL, tweets = NULL,
if (!is.null(x$error)) { if (!is.null(x$error)) {
log_warn("Error downloading {dest}: {x$error}") log_warn("Error downloading {dest}: {x$error}")
} else x$result } else x$result
}

+ 39
- 10
inst/gathertweet.R Ver fichero

Usage: Usage:
gathertweet search [--file=<file>] [options] [--] <terms>... gathertweet search [--file=<file>] [options] [--] <terms>...
gathertweet update [--file=<file> --token=<token> --backup --backup-dir=<dir> --polite --debug-args] gathertweet update [--file=<file> --token=<token> --backup --backup-dir=<dir> --polite --debug-args]
gathertweet simplify [--file=<file> --output=<output> --debug-args --polite <fields>...]


Arguments Arguments
<terms> Search terms. Individual search terms are queried separately, <terms> Search terms. Individual search terms are queried separately,
into a single query. WARNING: Wrap queries with spaces in into a single query. WARNING: Wrap queries with spaces in
\'single quotes\': double quotes are allowed inside single quotes only. \'single quotes\': double quotes are allowed inside single quotes only.


<fields> Tweet fields that should be included. Default value will include
`status_id`, `created_at`, `user_id`, `screen_name`, `text`,
`favorite_count`, `retweet_count`, `is_quote`, `hashtags`,
`mentions_screen_name`, `profile_url`, `profile_image_url`,
`media_url`, `urls_url`, `urls_expanded_url`.

Options: Options:
-h --help Show this screen. -h --help Show this screen.
--file=<file> Name of RDS file where tweets are stored [default: tweets.rds] --file=<file> Name of RDS file where tweets are stored [default: tweets.rds]
-n, --n <n> Number of tweets to return [default: 18000]
--type <type> Type of search results: "recent", "mixed", or "popular". [default: recent]
--include_rts Logical indicating whether retweets should be included
--geocode <geocode> Geographical limiter of the template "latitude,longitude,radius"
--max_id <max_id> Return results with an ID less than (older than) or equal to max_id
--since_id <since_id> Return results with an ID greater than (newer than) or equal to since_id,
automatically extracted from the existing tweets <file>, if it exists, and
ignored when <max_id> is set. [default: last]
--no-parse Disable parsing of the results --no-parse Disable parsing of the results
--token <token> See {rtweet} for more information --token <token> See {rtweet} for more information
--retryonratelimit Wait and retry when rate limited (only relevant when n exceeds 18000 tweets) --retryonratelimit Wait and retry when rate limited (only relevant when n exceeds 18000 tweets)
--backup Create a backup of existing tweet file before writing any new files --backup Create a backup of existing tweet file before writing any new files
--backup-dir <dir> Location for backups, use "" for current directory. [default: backups] --backup-dir <dir> Location for backups, use "" for current directory. [default: backups]
--debug-args Print values of the arguments only --debug-args Print values of the arguments only

search:
-n, --n <n> Number of tweets to return [default: 18000]
--type <type> Type of search results: "recent", "mixed", or "popular". [default: recent]
--include_rts Logical indicating whether retweets should be included
--geocode <geocode> Geographical limiter of the template "latitude,longitude,radius"
--max_id <max_id> Return results with an ID less than (older than) or equal to max_id
--since_id <since_id> Return results with an ID greater than (newer than) or equal to since_id,
automatically extracted from the existing tweets <file>, if it exists, and
ignored when <max_id> is set. [default: last]
--and-simplify Create additional simplified tweet set with default values.
Run `gathertweet simplify` manually for more control.
simplify:
--output=<output> Output file, default is input file with `_simplified` appended to name.
' -> doc ' -> doc


library(docopt) library(docopt)
} }


library(gathertweet) library(gathertweet)
action <- names(Filter(isTRUE, args[c("search", "update")]))
action <- names(Filter(isTRUE, args[c("search", "update", "simplify")]))


if (args$polite) { if (args$polite) {
lockfile <- paste0(".gathertweet_", lockfile <- paste0(".gathertweet_",
digest::digest(args[c("file", "search", "update")]),
digest::digest(args[c("file", "search", "update", "simplify")]),
".lock") ".lock")
lck <- filelock::lock(lockfile, exclusive = TRUE, timeout = 0) lck <- filelock::lock(lockfile, exclusive = TRUE, timeout = 0)
gathertweet:::stopifnot_locked(lck, "Another gathertweet {action} process is currently running for {args$file}") gathertweet:::stopifnot_locked(lck, "Another gathertweet {action} process is currently running for {args$file}")


# Search ------------------------------------------------------------------ # Search ------------------------------------------------------------------
if (isTRUE(args$search)) { if (isTRUE(args$search)) {
if (args[["--and-simplify"]]) args$simplify <- TRUE


log_info("Searching for \"{paste0(args$terms, collapse = '\", \"')}\"") log_info("Searching for \"{paste0(args$terms, collapse = '\", \"')}\"")




} }


# Simplify ----------------------------------------------------------------
# Runs for `gathertweet simplify`, and also after a search that switched
# args$simplify on via --and-simplify.
if (isTRUE(args$simplify)) {
  logger("Simplifying tweets in {args$file}")
  tweets_simplified <- simplify_tweets(
    tweets = NULL,
    file = args$file,
    .fields = args$fields
  )
  log_debug("Simplified {nrow(tweets_simplified)} tweets")
  if (is.null(args$output)) {
    # No --output given: derive the destination from the input file name.
    args$output <- gathertweet:::path_add(args$file, append = "_simplified")
  }
  log_info("Saving simplified tweets to {args$output}")
  # The return value is not needed; invisible() keeps it from being printed
  # at top level without binding it to an unused (previously misspelled) name.
  invisible(save_tweets(tweets_simplified, args$output))
}

if (args$polite) { if (args$polite) {
filelock::unlock(lck) filelock::unlock(lck)
unlink(lockfile) unlink(lockfile)

Cargando…
Cancelar
Guardar