Просмотр исходного кода

Add gathertweet simplify

master
Garrick Aden-Buie 7 лет назад
Родитель
Сommit
fb6b293b80
5 измененных файлов: 142 добавлений и 13 удалений
  1. +1
    -0
      .Rbuildignore
  2. +2
    -0
      NAMESPACE
  3. +82
    -3
      R/tweet_io.R
  4. +41
    -10
      inst/gathertweet.R
  5. +16
    -0
      man/get_user_info.Rd

+ 1
- 0
.Rbuildignore Просмотреть файл

^R/initial_functions\.R$
^LICENSE\.md$ ^LICENSE\.md$
^gathertweet\.Rproj$ ^gathertweet\.Rproj$
^\.Rproj\.user$ ^\.Rproj\.user$

+ 2
- 0
NAMESPACE Просмотреть файл



export("%>%") export("%>%")
export(backup_tweets) export(backup_tweets)
export(get_user_info)
export(install_gathertweet) export(install_gathertweet)
export(last_seen_tweet) export(last_seen_tweet)
export(log_debug) export(log_debug)
export(logger) export(logger)
export(read_tweets) export(read_tweets)
export(save_tweets) export(save_tweets)
export(simplify_tweets)
export(update_tweets) export(update_tweets)
import(filelock) import(filelock)
importFrom(dplyr,anti_join) importFrom(dplyr,anti_join)

+ 82
- 3
R/tweet_io.R Просмотреть файл

file = getOption("gathertweet.file", "tweets.rds"), file = getOption("gathertweet.file", "tweets.rds"),
save_fun = saveRDS, save_fun = saveRDS,
read_fun = read_tweets, read_fun = read_tweets,
lck = NULL
lck = NULL,
key_var = "status_id"
) { ) {
if (nrow(tweets) < 1) return(tweets) if (nrow(tweets) < 1) return(tweets)
fs::dir_create(fs::path_dir(file)) fs::dir_create(fs::path_dir(file))
# Don't drop or lose old tweets # Don't drop or lose old tweets
tweets_prev <- read_fun(file, lck = lck) tweets_prev <- read_fun(file, lck = lck)
if (!is.null(tweets_prev)) { if (!is.null(tweets_prev)) {
tweets_not_new <- anti_join(tweets_prev, tweets, by = "status_id")
tweets_not_new <- anti_join(tweets_prev, tweets, by = key_var)
if (nrow(tweets_not_new)) { if (nrow(tweets_not_new)) {
tweets <- bind_rows(tweets, tweets_not_new) tweets <- bind_rows(tweets, tweets_not_new)
} }
if (length(setdiff(tweets_prev$status_id, tweets$status_id)) != 0) {
if (length(setdiff(tweets_prev[[key_var]], tweets[[key_var]])) != 0) {
log_fatal("An error occurred that would have lost stored tweets") log_fatal("An error occurred that would have lost stored tweets")
} }
} }
fs::file_copy(file, file_backup) fs::file_copy(file, file_backup)
} }


#' Simplify tweets
#'
#' Reduce a set of tweets to a selection of its columns.
#'
#' @param tweets A data frame of tweets. When `NULL`, the tweets are read from
#'   `file` with `read_tweets()`.
#' @param file The file the tweets are read from when `tweets` is `NULL`.
#' @param ...,.fields Columns to keep, e.g. column names given as character
#'   strings. Values given in `...` are placed before those in `.fields`. When
#'   neither is supplied, a default set of columns is kept.
#' @return `tweets` restricted to the selected columns, or `NULL` when `tweets`
#'   is `NULL` and `read_tweets()` returns `NULL` as well.
#' @export
simplify_tweets <- function(
  tweets = NULL,
  file = getOption("gathertweet.file", "tweets.rds"),
  ...,
  .fields = NULL
) {
  if (is.null(tweets)) tweets <- read_tweets(file)
  if (is.null(tweets)) return(NULL)

  .fields <- c(list(...), .fields)
  if (length(.fields) == 0) {
    # Default columns are named as strings rather than bare symbols so that
    # R CMD check does not report them as undefined global variables.
    .fields <- list(
      "created_at",
      "status_id",
      "user_id",
      "screen_name",
      "text",
      "favorite_count",
      "retweet_count",
      "hashtags",
      "profile_url",
      "profile_image_url",
      "urls_expanded_url",
      "mentions_screen_name",
      "is_quote",
      "media_url",
      "urls_url"
    )
  }

  dplyr::select(tweets, !!!.fields)
}

#' @export #' @export
update_tweets <- function( update_tweets <- function(
tweets = NULL, tweets = NULL,
exclusive_lock <- function(file, timeout = 1 * 60 * 1000) { exclusive_lock <- function(file, timeout = 1 * 60 * 1000) {
lock(path_lock(file), exclusive = TRUE, timeout = timeout) lock(path_lock(file), exclusive = TRUE, timeout = timeout)
} }

#' @title Get user info
#' @description Extracts the distinct user records from a set of tweets, saves
#'   them next to the tweet file and optionally downloads the users' profile
#'   images.
#' @param tweets A data frame of tweets. When `NULL`, the tweets are read from
#'   `file` with `read_tweets()`.
#' @param file The file where tweets are located. The text `_users` is
#'   automatically appended to this file name.
#' @param dir_profile_images Directory that profile images are downloaded
#'   into. When `NULL` (the default) no images are downloaded.
#' @return The result of `save_tweets()` for the user data, or `NULL` when
#'   `tweets` is `NULL` and `read_tweets()` returns `NULL` as well.
#' @export
get_user_info <- function(
  tweets = NULL,
  file = getOption("gathertweet.file", "tweets.rds"),
  dir_profile_images = NULL
) {
  # The result of read_tweets() has to be assigned, otherwise `tweets` stays
  # NULL and the pipeline below fails.
  if (is.null(tweets)) tweets <- read_tweets(file)
  if (is.null(tweets)) return(NULL)

  user_file <- path_add(file, append = "_users")
  users <- tweets %>%
    rtweet::users_data() %>%
    dplyr::distinct()

  # Users are keyed by `user_id` rather than the default `status_id`.
  users <- save_tweets(users, user_file, key_var = "user_id")

  if (!is.null(dir_profile_images)) {
    # Called for the download side effect only; the results are not used.
    lapply(
      users$profile_image_url,
      download_profile_images,
      output_dir = dir_profile_images
    )
  }

  users
}

# Download a single profile image below `output_dir`.
#
# The local path is derived from the URL by replacing everything up to and
# including the first match of "profile" with "profile"; the directory for that
# path is created before the download is handed to download_file().
# Arguments in `...` are accepted but not used.
download_profile_images <- function(profile_image_url, ..., output_dir = "data") {
  local_path <- fs::path(
    output_dir,
    sub("^.+?profile", "profile", profile_image_url)
  )
  fs::dir_create(fs::path_dir(local_path), recursive = TRUE)
  download_file(profile_image_url, local_path)
}

# Download `url` to `dest` unless `dest` already exists.
#
# Returns `dest` when the file is already present or the download succeeded.
# When the download raises an error, a warning is logged via log_warn() and its
# result is returned instead.
download_file <- function(url, dest) {
  if (fs::file_exists(dest)) return(dest)

  # The handler returns the error message; NULL means the download succeeded.
  error_msg <- tryCatch({
    # Binary mode keeps image files intact on Windows, where the default
    # text mode would alter the bytes written.
    download.file(url, dest, mode = "wb")
    NULL
  }, error = function(e) conditionMessage(e))

  if (is.null(error_msg)) return(dest)

  # Remove any partial file so that the existence check above does not treat a
  # failed download as complete on the next call.
  if (fs::file_exists(dest)) fs::file_delete(dest)
  log_warn("Error downloading {dest}: {error_msg}")
}

+ 41
- 10
inst/gathertweet.R Просмотреть файл

Usage: Usage:
gathertweet search [--file=<file>] [options] [--] <terms>... gathertweet search [--file=<file>] [options] [--] <terms>...
gathertweet update [--file=<file> --token=<token> --backup --backup-dir=<dir> --polite --debug-args] gathertweet update [--file=<file> --token=<token> --backup --backup-dir=<dir> --polite --debug-args]
gathertweet simplify [--file=<file> --output=<output> --debug-args --polite <fields>...]


Arguments Arguments
<terms> Search terms. Individual search terms are queried separately, <terms> Search terms. Individual search terms are queried separately,
into a single query. WARNING: Wrap queries with spaces in into a single query. WARNING: Wrap queries with spaces in
\'single quotes\': double quotes are allowed inside single quotes only. \'single quotes\': double quotes are allowed inside single quotes only.


<fields> Tweet fields that should be included. Default value will include
`status_id`, `created_at`, `user_id`, `screen_name`, `text`,
`favorite_count`, `retweet_count`, `is_quote`, `hashtags`,
`mentions_screen_name`, `profile_url`, `profile_image_url`,
`media_url`, `urls_url`, `urls_expanded_url`.

Options: Options:
-h --help Show this screen. -h --help Show this screen.
--file=<file> Name of RDS file where tweets are stored [default: tweets.rds] --file=<file> Name of RDS file where tweets are stored [default: tweets.rds]
-n, --n <n> Number of tweets to return [default: 18000]
--type <type> Type of search results: "recent", "mixed", or "popular". [default: recent]
--include_rts Logical indicating whether retweets should be included
--geocode <geocode> Geographical limiter of the template "latitude,longitude,radius"
--max_id <max_id> Return results with an ID less than (older than) or equal to max_id
--since_id <since_id> Return results with an ID greather than (newer than) or equal to since_id,
automatically extracted from the existing tweets <file>, if it exists, and
ignored when <max_id> is set. "none" for all available tweets. [default: last]
--no-parse Disable parsing of the results --no-parse Disable parsing of the results
--token <token> See {rtweet} for more information --token <token> See {rtweet} for more information
--retryonratelimit Wait and retry when rate limited (only relevant when n exceeds 18000 tweets) --retryonratelimit Wait and retry when rate limited (only relevant when n exceeds 18000 tweets)
--backup Create a backup of existing tweet file before writing any new files --backup Create a backup of existing tweet file before writing any new files
--backup-dir <dir> Location for backups, use "" for current directory. [default: backups] --backup-dir <dir> Location for backups, use "" for current directory. [default: backups]
--debug-args Print values of the arguments only --debug-args Print values of the arguments only

search:
-n, --n <n> Number of tweets to return [default: 18000]
--type <type> Type of search results: "recent", "mixed", or "popular". [default: recent]
--include_rts Logical indicating whether retweets should be included
--geocode <geocode> Geographical limiter of the template "latitude,longitude,radius"
--max_id <max_id> Return results with an ID less than (older than) or equal to max_id
--since_id <since_id> Return results with an ID greater than (newer than) or equal to since_id,
automatically extracted from the existing tweets <file>, if it exists, and
ignored when <max_id> is set. "none" for all available tweets. [default: last]
--and-simplify Create additional simplified tweet set with default values.
Run `gathertweet simplify` manually for more control.
simplify:
--output=<output> Output file, default is input file with `_simplified` appended to name.
' -> doc ' -> doc


library(docopt) library(docopt)
} }


library(gathertweet) library(gathertweet)
action <- names(Filter(isTRUE, args[c("search", "update")]))
action <- names(Filter(isTRUE, args[c("search", "update", "simplify")]))


if (args$polite) { if (args$polite) {
lockfile <- paste0(".gathertweet_", lockfile <- paste0(".gathertweet_",
digest::digest(args[c("file", "search", "update")]),
digest::digest(args[c("file", "search", "update", "simplify")]),
".lock") ".lock")
lck <- filelock::lock(lockfile, exclusive = TRUE, timeout = 0) lck <- filelock::lock(lockfile, exclusive = TRUE, timeout = 0)
gathertweet:::stopifnot_locked(lck, "Another gathertweet {action} process is currently running for {args$file}") gathertweet:::stopifnot_locked(lck, "Another gathertweet {action} process is currently running for {args$file}")


# Search ------------------------------------------------------------------ # Search ------------------------------------------------------------------
if (isTRUE(args$search)) { if (isTRUE(args$search)) {
if (args[["--and-simplify"]]) args$simplify <- TRUE


log_info("Searching for \"{paste0(args$terms, collapse = '\", \"')}\"") log_info("Searching for \"{paste0(args$terms, collapse = '\", \"')}\"")




} }



# Simplify ----------------------------------------------------------------
# Reads the tweets in `args$file`, keeps the requested (or default) fields and
# saves the result to `args$output`, which defaults to the input file name with
# `_simplified` appended.
if (isTRUE(args$simplify)) {
  logger("Simplifying tweets in {args$file}")
  tweets_simplified <- simplify_tweets(
    tweets = NULL,
    file = args$file,
    .fields = args$fields
  )
  if (is.null(tweets_simplified)) {
    # simplify_tweets() returns NULL when no tweets could be read; passing that
    # on to save_tweets() would fail on its row-count check.
    log_info("No tweets found in {args$file}, nothing to simplify")
  } else {
    log_debug("Simplified {nrow(tweets_simplified)} tweets")
    if (is.null(args$output)) {
      args$output <- gathertweet:::path_add(args$file, append = "_simplified")
    }
    log_info("Saving simplified tweets to {args$output}")
    tweets_simplified <- save_tweets(tweets_simplified, args$output)
  }
}

if (args$polite) { if (args$polite) {
filelock::unlock(lck) filelock::unlock(lck)
unlink(lockfile) unlink(lockfile)

+ 16
- 0
man/get_user_info.Rd Просмотреть файл

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/tweet_io.R
\name{get_user_info}
\alias{get_user_info}
\title{Get user info}
\usage{
get_user_info(tweets = NULL, file = getOption("gathertweet.file",
"tweets.rds"), dir_profile_images = NULL)
}
\arguments{
\item{file}{The file where tweets are located. The text \code{_users} is
automatically appended to this file name.}
}
\description{
Get user info
}

Загрузка…
Отмена
Сохранить