Kaynağa Gözat

Add gathertweet simplify

master
Garrick Aden-Buie 7 yıl önce
ebeveyn
işleme
fb6b293b80
5 değiştirilmiş dosya ile 142 ekleme ve 13 silme
  1. +1
    -0
      .Rbuildignore
  2. +2
    -0
      NAMESPACE
  3. +82
    -3
      R/tweet_io.R
  4. +41
    -10
      inst/gathertweet.R
  5. +16
    -0
      man/get_user_info.Rd

+ 1
- 0
.Rbuildignore Dosyayı Görüntüle

@@ -1,3 +1,4 @@
^R/initial_functions\.R$
^LICENSE\.md$
^gathertweet\.Rproj$
^\.Rproj\.user$

+ 2
- 0
NAMESPACE Dosyayı Görüntüle

@@ -2,6 +2,7 @@

export("%>%")
export(backup_tweets)
export(get_user_info)
export(install_gathertweet)
export(last_seen_tweet)
export(log_debug)
@@ -13,6 +14,7 @@ export(log_warn)
export(logger)
export(read_tweets)
export(save_tweets)
export(simplify_tweets)
export(update_tweets)
import(filelock)
importFrom(dplyr,anti_join)

+ 82
- 3
R/tweet_io.R Dosyayı Görüntüle

@@ -4,7 +4,8 @@ save_tweets <- function(
file = getOption("gathertweet.file", "tweets.rds"),
save_fun = saveRDS,
read_fun = read_tweets,
lck = NULL
lck = NULL,
key_var = "status_id"
) {
if (nrow(tweets) < 1) return(tweets)
fs::dir_create(fs::path_dir(file))
@@ -18,11 +19,11 @@ save_tweets <- function(
# Don't drop or lose old tweets
tweets_prev <- read_fun(file, lck = lck)
if (!is.null(tweets_prev)) {
tweets_not_new <- anti_join(tweets_prev, tweets, by = "status_id")
tweets_not_new <- anti_join(tweets_prev, tweets, by = key_var)
if (nrow(tweets_not_new)) {
tweets <- bind_rows(tweets, tweets_not_new)
}
if (length(setdiff(tweets_prev$status_id, tweets$status_id)) != 0) {
if (length(setdiff(tweets_prev[[key_var]], tweets[[key_var]])) != 0) {
log_fatal("An error occurred that would have lost stored tweets")
}
}
@@ -79,6 +80,40 @@ backup_tweets <- function(
fs::file_copy(file, file_backup)
}

#' Simplify tweets to a subset of columns
#'
#' Selects columns from a tweets data frame. When no columns are requested,
#' a fixed default set of columns is selected.
#'
#' @param tweets A data frame of tweets. When `NULL`, the tweets are read
#'   from `file` with `read_tweets()`.
#' @param file The file tweets are read from when `tweets` is `NULL`.
#' @param ... Column selections, collected with `list(...)` and spliced into
#'   `dplyr::select()`.
#' @param .fields Further column selections, appended after those in `...`.
#' @return The tweets reduced to the selected columns, or `NULL` when no
#'   tweets were supplied and none could be read from `file`.
#' @export
simplify_tweets <- function(
  tweets = NULL,
  file = getOption("gathertweet.file", "tweets.rds"),
  ...,
  .fields = NULL
) {
  if (is.null(tweets)) {
    tweets <- read_tweets(file)
    # Nothing was supplied and nothing could be read
    if (is.null(tweets)) {
      return(NULL)
    }
  }

  selection <- c(list(...), .fields)

  if (length(selection) == 0) {
    # No explicit request: fall back to the default column set
    return(
      tweets %>%
        dplyr::select(
          created_at, status_id, user_id, screen_name, text,
          favorite_count, retweet_count, hashtags,
          profile_url, profile_image_url,
          urls_expanded_url, mentions_screen_name,
          is_quote, media_url, urls_url
        )
    )
  }

  dplyr::select(tweets, !!!selection)
}

#' @export
update_tweets <- function(
tweets = NULL,
@@ -158,3 +193,47 @@ shared_lock <- function(file, timeout = 1 * 60 * 1000) {
# Request an exclusive lock on the lock path derived from `file`.
# `timeout` is forwarded unchanged to `lock()`; the default evaluates to 60000
# (presumably milliseconds, i.e. one minute -- confirm against filelock).
exclusive_lock <- function(file, timeout = 1 * 60 * 1000) {
  lock_path <- path_lock(file)
  lock(lock_path, timeout = timeout, exclusive = TRUE)
}

#' @title Get user info
#' @description Extracts the distinct user records from a set of tweets with
#'   `rtweet::users_data()`, stores them through `save_tweets()` keyed on
#'   `user_id`, and optionally downloads the users' profile images.
#' @param tweets A data frame of tweets. When `NULL`, the tweets are read
#'   from `file`.
#' @param file The file where tweets are located. The text `_users` is
#'   automatically appended to this file name.
#' @param dir_profile_images Directory that profile images are downloaded
#'   into. When `NULL` (the default) no images are downloaded.
#' @return The value returned by `save_tweets()` for the users, or `NULL`
#'   when no tweets were supplied and none could be read from `file`.
#' @export
get_user_info <- function(
  tweets = NULL,
  file = getOption("gathertweet.file", "tweets.rds"),
  dir_profile_images = NULL
) {
  # The result of read_tweets() has to be kept, otherwise `tweets` stays NULL
  if (is.null(tweets)) tweets <- read_tweets(file)
  if (is.null(tweets)) return(NULL)

  user_file <- path_add(file, append = "_users")
  users <- tweets %>%
    rtweet::users_data() %>%
    dplyr::distinct()

  # Users are keyed on `user_id` rather than the default `status_id`
  users <- save_tweets(users, user_file, key_var = "user_id")

  if (!is.null(dir_profile_images)) {
    # Skip missing URLs: they cannot be turned into a destination path
    image_urls <- users$profile_image_url
    image_urls <- unique(image_urls[!is.na(image_urls)])
    for (image_url in image_urls) {
      download_profile_images(image_url, output_dir = dir_profile_images)
    }
  }

  users
}

# Download one profile image into `output_dir`.
# The destination path is the URL with its leading part, up to the first
# match of "profile", removed; the directory is created before downloading.
# Arguments passed through `...` are not used.
download_profile_images <- function(profile_image_url, ..., output_dir = "data") {
  relative_path <- sub("^.+?profile", "profile", profile_image_url)
  destination <- fs::path(output_dir, relative_path)

  destination_dir <- fs::path_dir(destination)
  fs::dir_create(destination_dir, recursive = TRUE)

  download_file(profile_image_url, destination)
}

# Download `url` to `dest` unless `dest` already exists.
#
# Returns `dest` when the file is already present or the download succeeds.
# When the download fails, the failure is reported through `log_warn()`
# (whose value is then returned) and any partially written file is removed.
download_file <- function(url, dest) {
  if (fs::file_exists(dest)) return(dest)

  error_message <- tryCatch({
    # mode = "wb" requests a binary transfer; the default text mode can
    # corrupt binary files such as images on Windows
    status <- download.file(url, dest, mode = "wb")
    # Some download methods signal failure with a non-zero status code
    # instead of raising an error
    if (!identical(as.integer(status), 0L)) {
      stop("download.file() returned status ", status, call. = FALSE)
    }
    NULL
  }, error = function(e) conditionMessage(e))

  if (is.null(error_message)) return(dest)

  # Remove a partial download so that the file_exists() shortcut above does
  # not mistake it for a completed file on the next call
  unlink(dest)
  log_warn("Error downloading {dest}: {error_message}")
}

+ 41
- 10
inst/gathertweet.R Dosyayı Görüntüle

@@ -6,6 +6,7 @@
Usage:
gathertweet search [--file=<file>] [options] [--] <terms>...
gathertweet update [--file=<file> --token=<token> --backup --backup-dir=<dir> --polite --debug-args]
gathertweet simplify [--file=<file> --output=<output> --debug-args --polite <fields>...]

Arguments
<terms> Search terms. Individual search terms are queried separately,
@@ -15,17 +16,15 @@ Arguments
into a single query. WARNING: Wrap queries with spaces in
\'single quotes\': double quotes are allowed inside single quotes only.

<fields> Tweet fields that should be included. Default value will include
`status_id`, `created_at`, `user_id`, `screen_name`, `text`,
`favorite_count`, `retweet_count`, `is_quote`, `hashtags`,
`mentions_screen_name`, `profile_url`, `profile_image_url`,
`media_url`, `urls_url`, `urls_expanded_url`.

Options:
-h --help Show this screen.
--file=<file> Name of RDS file where tweets are stored [default: tweets.rds]
-n, --n <n> Number of tweets to return [default: 18000]
--type <type> Type of search results: "recent", "mixed", or "popular". [default: recent]
--include_rts Logical indicating whether retweets should be included
--geocode <geocode> Geographical limiter of the template "latitude,longitude,radius"
--max_id <max_id> Return results with an ID less than (older than) or equal to max_id
--since_id <since_id> Return results with an ID greather than (newer than) or equal to since_id,
automatically extracted from the existing tweets <file>, if it exists, and
ignored when <max_id> is set. "none" for all available tweets. [default: last]
--no-parse Disable parsing of the results
--token <token> See {rtweet} for more information
--retryonratelimit Wait and retry when rate limited (only relevant when n exceeds 18000 tweets)
@@ -34,6 +33,20 @@ Options:
--backup Create a backup of existing tweet file before writing any new files
--backup-dir <dir> Location for backups, use "" for current directory. [default: backups]
--debug-args Print values of the arguments only

search:
-n, --n <n> Number of tweets to return [default: 18000]
--type <type> Type of search results: "recent", "mixed", or "popular". [default: recent]
--include_rts Logical indicating whether retweets should be included
--geocode <geocode> Geographical limiter of the template "latitude,longitude,radius"
--max_id <max_id> Return results with an ID less than (older than) or equal to max_id
--since_id <since_id> Return results with an ID greather than (newer than) or equal to since_id,
automatically extracted from the existing tweets <file>, if it exists, and
ignored when <max_id> is set. "none" for all available tweets. [default: last]
--and-simplify Create additional simplified tweet set with default values.
Run `gathertweet simplify` manually for more control.
simplify:
--output=<output> Output file, default is input file with `_simplified` appended to name.
' -> doc

library(docopt)
@@ -47,11 +60,11 @@ if (args$`--debug-args`) {
}

library(gathertweet)
action <- names(Filter(isTRUE, args[c("search", "update")]))
action <- names(Filter(isTRUE, args[c("search", "update", "simplify")]))

if (args$polite) {
lockfile <- paste0(".gathertweet_",
digest::digest(args[c("file", "search", "update")]),
digest::digest(args[c("file", "search", "update", "simplify")]),
".lock")
lck <- filelock::lock(lockfile, exclusive = TRUE, timeout = 0)
gathertweet:::stopifnot_locked(lck, "Another gathertweet {action} process is currently running for {args$file}")
@@ -61,6 +74,7 @@ log_info("---- gathertweet {action} start ----")

# Search ------------------------------------------------------------------
if (isTRUE(args$search)) {
if (args[["--and-simplify"]]) args$simplify <- TRUE

log_info("Searching for \"{paste0(args$terms, collapse = '\", \"')}\"")

@@ -125,6 +139,23 @@ if (isTRUE(args$search)) {

}


# Simplify ----------------------------------------------------------------
# Reduce the stored tweets to the requested (or default) fields and save the
# result to a separate output file.
if (isTRUE(args$simplify)) {
  logger("Simplifying tweets in {args$file}")
  tweets_simplified <- simplify_tweets(
    tweets = NULL,
    file = args$file,
    .fields = args$fields
  )

  if (is.null(tweets_simplified)) {
    # simplify_tweets() returns NULL when no tweets could be read; passing
    # NULL on to save_tweets() would fail on its row-count check
    log_warn("No tweets found in {args$file}, nothing to simplify")
  } else {
    log_debug("Simplified {nrow(tweets_simplified)} tweets")
    if (is.null(args$output)) {
      # Default output: the input file name with `_simplified` appended
      args$output <- gathertweet:::path_add(args$file, append = "_simplified")
    }
    log_info("Saving simplified tweets to {args$output}")
    tweets_simplified <- save_tweets(tweets_simplified, args$output)
  }
}

if (args$polite) {
filelock::unlock(lck)
unlink(lockfile)

+ 16
- 0
man/get_user_info.Rd Dosyayı Görüntüle

@@ -0,0 +1,16 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/tweet_io.R
\name{get_user_info}
\alias{get_user_info}
\title{Get user info}
\usage{
get_user_info(tweets = NULL, file = getOption("gathertweet.file",
"tweets.rds"), dir_profile_images = NULL)
}
\arguments{
\item{file}{The file where tweets are located. The text \code{_users} is
automatically appended to this file name.}
}
\description{
Get user info
}

Yükleniyor…
İptal
Kaydet