Procházet zdrojové kódy

Initial commit

simplify
Garrick Aden-Buie před 7 roky
revize
38623206a4
19 změnil soubory, kde provedl 750 přidání a 0 odebrání
  1. +4
    -0
      .Rbuildignore
  2. +75
    -0
      .gitignore
  3. +28
    -0
      DESCRIPTION
  4. +2
    -0
      LICENSE
  5. +21
    -0
      LICENSE.md
  6. +22
    -0
      NAMESPACE
  7. +5
    -0
      R/gathertweet-package.R
  8. +23
    -0
      R/install.R
  9. +61
    -0
      R/logging.R
  10. +156
    -0
      R/tweet_io.R
  11. +11
    -0
      R/utils-pipe.R
  12. +57
    -0
      README.Rmd
  13. +76
    -0
      README.md
  14. +21
    -0
      gathertweet.Rproj
  15. +113
    -0
      inst/gathertweet.R
  16. +18
    -0
      man/gathertweet-package.Rd
  17. +16
    -0
      man/install_gathertweet.Rd
  18. +29
    -0
      man/logger.Rd
  19. +12
    -0
      man/pipe.Rd

+ 4
- 0
.Rbuildignore Zobrazit soubor

^LICENSE\.md$
^gathertweet\.Rproj$
^\.Rproj\.user$
^README\.Rmd$

+ 75
- 0
.gitignore Zobrazit soubor

# ---- Default .gitignore From grkmisc ----
.Rproj.user
.Rhistory
.RData
.DS_Store

# Directories that start with _
_*/

## https://github.com/github/gitignore/blob/master/R.gitignore
# History files
.Rhistory
.Rapp.history

# Session Data files
.RData

# Example code in package build process
*-Ex.R

# Output files from R CMD build
/*.tar.gz

# Output files from R CMD check
/*.Rcheck/

# RStudio files
.Rproj.user/

# produced vignettes
vignettes/*.html
vignettes/*.pdf

# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth

# knitr and R markdown default cache directories
/*_cache/
/cache/

# Temporary files created by R markdown
*.utf8.md
*.knit.md

# Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
rsconnect/

## https://github.com/github/gitignore/blob/master/Global/macOS.gitignore
# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon


# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

+ 28
- 0
DESCRIPTION Zobrazit soubor

Package: gathertweet
Title: Gather Tweets from the Command Line with rtweet
Version: 0.0.0.9000
Authors@R:
person(given = "Garrick",
family = "Aden-Buie",
role = c("aut", "cre"),
email = "garrick@adenbuie.com")
Description: A command line utility for common tweet gathering
tasks. Leverages the rtweet package for cron-friendly,
boilerplate-free tweet collection. Currently in progress and only
suitable for small search-based tweet collection projects.
License: MIT + file LICENSE
Imports:
digest,
docopt,
filelock,
fs,
futile.logger,
here,
magrittr,
rlang,
rtweet,
tibble
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 6.1.1

+ 2
- 0
LICENSE Zobrazit soubor

YEAR: 2019
COPYRIGHT HOLDER: Garrick Aden-Buie

+ 21
- 0
LICENSE.md Zobrazit soubor

# MIT License

Copyright (c) 2019 Garrick Aden-Buie

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

+ 22
- 0
NAMESPACE Zobrazit soubor

# Generated by roxygen2: do not edit by hand

export("%>%")
export(backup_tweets)
export(install_gathertweet)
export(last_seen_tweet)
export(log_debug)
export(log_error)
export(log_fatal)
export(log_info)
export(log_pipe)
export(log_warn)
export(logger)
export(read_tweets)
export(save_tweets)
export(update_tweets)
import(filelock)
importFrom(fs,dir_exists)
importFrom(fs,file_exists)
importFrom(fs,path)
importFrom(magrittr,"%>%")
importFrom(tibble,tibble)

+ 5
- 0
R/gathertweet-package.R Zobrazit soubor

#' @keywords internal
#' @importFrom fs dir_exists file_exists path
#' @importFrom tibble tibble
#' @import filelock
"_PACKAGE"

+ 23
- 0
R/install.R Zobrazit soubor

#' Install gathertweet executable script
#'
#' Creates a link named `gathertweet` inside `location` that points to the
#' `gathertweet.R` script shipped with the installed package. Should work
#' with Unix and MacOS out of the box, but I can't make any guarantees about
#' Windows.
#'
#' @param location Directory in which the `gathertweet` link is created.
#'   The directory must already exist and be writable.
#' @export
install_gathertweet <- function(location = "/usr/local/bin") {
  # Both checks abort through log_fatal(), which logs and then raises an error.
  location_missing <- !dir_exists(location)
  if (location_missing) {
    log_fatal("Location {location} does not exist")
  }

  location_writable <- fs::file_access(location, "write")
  if (!location_writable) {
    log_fatal("You do not have write permissions for {location}")
  }

  log_info("Creating link to gathertweet at {location}/gathertweet")

  script_path <- system.file("gathertweet.R", package = "gathertweet")
  link_path <- path(location, "gathertweet")
  fs::link_create(script_path, link_path)
}

+ 61
- 0
R/logging.R Zobrazit soubor

# Package load hook: registers the "[timestamp] [level] message" layout on the
# futile.logger logger named "gathertweet".
.onLoad <- function(libname, pkgname) {
  futile.logger::flog.layout(
    futile.logger::layout.format("[~t] [~l] ~m"),
    name = "gathertweet"
  )
}

#' @title Logging functions
#'
#' @description `logger()` builds a message with [glue::glue()] and passes it
#'   to the futile.logger function matching `level`. The `log_*()` helpers
#'   call `logger()` with a fixed level.
#'
#' @param ... Message pieces handed to [glue::glue()].
#' @param level Log level name, matched case-insensitively: one of "trace",
#'   "debug", "info", "warn", "error" or "fatal". Any other value is logged
#'   at the "info" level.
#' @param envir Environment in which the glue expressions are evaluated.
#' @export
logger <- function(..., level = "info", envir = parent.frame()) {
  # Interpolate first so a broken template fails before a level is chosen.
  text <- glue::glue(..., .envir = envir)

  # The trailing unnamed entry is the fallback for unrecognised levels.
  emit <- switch(
    tolower(level),
    trace = futile.logger::flog.trace,
    debug = futile.logger::flog.debug,
    info = futile.logger::flog.info,
    warn = futile.logger::flog.warn,
    error = futile.logger::flog.error,
    fatal = futile.logger::flog.fatal,
    futile.logger::flog.info
  )

  emit(text)
}

#' @rdname logger
#' @export
log_info <- function(..., envir = parent.frame()) {
  # Same as logger() with the level pinned to "info".
  logger(..., envir = envir, level = "info")
}

#' @rdname logger
#' @export
log_debug <- function(..., envir = parent.frame()) {
  # Same as logger() with the level pinned to "debug".
  logger(..., envir = envir, level = "debug")
}

#' @rdname logger
#' @export
log_warn <- function(..., envir = parent.frame()) {
  # Same as logger() with the level pinned to "warn".
  logger(..., envir = envir, level = "warn")
}

#' @rdname logger
#' @export
log_error <- function(..., envir = parent.frame()) {
  # Same as logger() with the level pinned to "error"; does not stop execution.
  logger(..., envir = envir, level = "error")
}

#' @rdname logger
#' @export
log_fatal <- function(..., envir = parent.frame()) {
  # Record the message at the "fatal" level first ...
  logger(..., envir = envir, level = "fatal")
  # ... then raise an error carrying the same interpolated text.
  fatal_text <- glue::glue(..., .envir = envir)
  rlang::abort(fatal_text)
}

#' @rdname logger
#' @param .data Object passed through a pipeline. It is not inspected and is
#'   returned unchanged, so `log_pipe()` can sit between two pipeline steps.
#' @export
log_pipe <- function(.data, ..., level = "info") {
  # Capture the caller's frame eagerly so glue expressions in `...` can see
  # the caller's variables rather than this function's locals.
  caller_env <- parent.frame()

  # Hand the raw pieces to logger() so they are interpolated exactly once;
  # interpolating here and again inside logger() would re-evaluate any braces
  # that appear in already-substituted values.
  logger(..., level = level, envir = caller_env)

  # Pass the piped value along instead of the logger's return value.
  .data
}

+ 156
- 0
R/tweet_io.R Zobrazit soubor

#' Save tweets to disk without dropping previously stored tweets
#'
#' If `file` already exists, stored tweets whose `status_id` is not present in
#' `tweets` are appended to `tweets` before the combined set is written.
#'
#' @param tweets Data frame of tweets with a `status_id` column. Returned
#'   untouched (and nothing is written) when it has no rows.
#' @param file Path of the tweet file. Defaults to option `gathertweet.file`,
#'   falling back to `"tweets.rds"`.
#' @param save_fun Function called as `save_fun(tweets, file)` to write.
#' @param read_fun Function called as `read_fun(file, lck = lck)` to read the
#'   existing file.
#' @param lck Optional lock already held by the caller. When `NULL`, an
#'   exclusive lock is taken here and released on exit.
#' @return The tweets that were written (new plus carried-over old ones).
#' @export
save_tweets <- function(
  tweets,
  file = getOption("gathertweet.file", "tweets.rds"),
  save_fun = saveRDS,
  read_fun = read_tweets,
  lck = NULL
) {
  # Nothing to write: hand the input straight back.
  if (nrow(tweets) < 1) {
    return(tweets)
  }

  fs::dir_create(fs::path_dir(file))

  # Only lock (and later unlock) when the caller did not pass a lock in.
  if (is.null(lck)) {
    lck <- exclusive_lock(file)
    on.exit(unlock(lck))
  }
  stopifnot_locked(lck, message = "Unable to acquire lock on {file}")

  if (fs::file_exists(file)) {
    # Re-use our lock for the read so the reader does not take its own.
    tweets_prev <- read_fun(file, lck = lck)

    # Ids stored on disk that the incoming batch does not contain.
    only_in_prev <- setdiff(tweets_prev$status_id, tweets$status_id)
    if (length(only_in_prev) > 0) {
      carried_over <- tweets_prev[tweets_prev$status_id %in% only_in_prev, ]
      tweets <- rbind(tweets, carried_over)
    }

    # Safety net: every previously stored id must survive the merge.
    stopifnot(length(setdiff(tweets_prev$status_id, tweets$status_id)) == 0)
  }

  save_fun(tweets, file)
  tweets
}

#' Find the largest status id among stored tweets
#'
#' Status ids are compared as digit strings (longer strings first, then by
#' character order) rather than being converted to doubles. Ids with more
#' than about 15 significant digits cannot be represented exactly as doubles,
#' and converting them back to text produces scientific notation.
#'
#' @param tweets Data frame of tweets with a `status_id` column. When `NULL`,
#'   tweets are read from `file`.
#' @param file Path of the tweet file. Defaults to option `gathertweet.file`,
#'   falling back to `"tweets.rds"`.
#' @return The largest `status_id` as a character string, or `NULL` when no
#'   tweets (or no non-missing ids) are available.
#' @export
last_seen_tweet <- function(
  tweets = NULL,
  file = getOption("gathertweet.file", "tweets.rds")
) {
  if (is.null(tweets)) tweets <- read_tweets(file)
  if (is.null(tweets)) return(NULL)

  ids <- as.character(tweets$status_id)
  ids <- ids[!is.na(ids)]
  # An empty tweet set has no "last" tweet; avoid max() on an empty vector.
  if (length(ids) == 0) return(NULL)

  # For non-negative integers without leading zeros, ordering by length and
  # then by characters equals numeric ordering. method = "radix" sorts
  # strings in the C locale, so the result does not depend on collation.
  ranked <- order(nchar(ids), ids, method = "radix")
  ids[ranked[length(ranked)]]
}

#' Read stored tweets from disk
#'
#' @param file Path of the tweet file. Defaults to option `gathertweet.file`,
#'   falling back to `"tweets.rds"`.
#' @param lck Optional lock already held by the caller. When `NULL`, a shared
#'   lock is taken here and released on exit.
#' @return The object stored in `file`, or `NULL` if `file` does not exist.
#' @export
read_tweets <- function(
  file = getOption("gathertweet.file", "tweets.rds"),
  lck = NULL
) {
  if (!file_exists(file)) {
    return(NULL)
  }

  # Only lock (and later unlock) when the caller did not pass a lock in.
  caller_holds_lock <- !is.null(lck)
  if (!caller_holds_lock) {
    lck <- shared_lock(file)
    on.exit(unlock(lck))
  }
  stopifnot_locked(lck, message = "Unable to acquire lock on {file}")

  readRDS(file)
}

#' Copy the tweet file to a timestamped backup
#'
#' The backup name is built by `path_add(file)`, which by default appends a
#' date-time suffix to the file's base name.
#'
#' @param file Path of the tweet file. Defaults to option `gathertweet.file`,
#'   falling back to `"tweets.rds"`.
#' @param lck Optional lock already held by the caller. When `NULL`, a shared
#'   lock is taken here and released on exit.
#' @return The result of [fs::file_copy()], or `NULL` if `file` does not
#'   exist.
#' @export
backup_tweets <- function(
  file = getOption("gathertweet.file", "tweets.rds"),
  lck = NULL
) {
  # No file yet means there is nothing to back up.
  if (!file_exists(file)) {
    return()
  }

  # Only lock (and later unlock) when the caller did not pass a lock in.
  caller_holds_lock <- !is.null(lck)
  if (!caller_holds_lock) {
    lck <- shared_lock(file)
    on.exit(unlock(lck))
  }
  stopifnot_locked(lck, message = "Unable to acquire lock on {file}")

  file_backup <- path_add(file)
  log_info("Backing up tweet file to {file_backup}")
  fs::file_copy(file, file_backup)
}

#' Re-fetch stored tweets by status id
#'
#' Looks up every `status_id` in `tweets` again through
#' `lookup_status_ratelimit()`.
#'
#' @param tweets Data frame of tweets with a `status_id` column. When `NULL`,
#'   tweets are read from `file`.
#' @param file Path of the tweet file. Defaults to option `gathertweet.file`,
#'   the option used by the other tweet I/O functions. The previously used
#'   option name `tweets.file` is still honoured as a fallback, and
#'   `"tweets.rds"` is used when neither is set.
#' @param ... Passed on to `lookup_status_ratelimit()`.
#' @return The value returned by `lookup_status_ratelimit()`.
#' @export
update_tweets <- function(
  tweets = NULL,
  file = getOption("gathertweet.file", getOption("tweets.file", "tweets.rds")),
  ...
) {
  if (is.null(tweets)) tweets <- read_tweets(file)
  lookup_status_ratelimit(tweets$status_id, ...)
}

# Look up statuses in chunks of at most 90000 ids, checking the
# "statuses/lookup" rate limit before each chunk and sleeping until the limit
# resets when no calls remain.
#
# status_id: vector of status ids to look up.
# ...:       accepted for interface compatibility.
#            NOTE(review): these arguments are not forwarded to
#            rtweet::lookup_statuses(); confirm whether they should be.
#
# Returns the row-bound results of all chunks, or NULL when `status_id` is
# empty.
lookup_status_ratelimit <- function(status_id, ...) {
  chunk_size <- 90000
  n_status <- length(status_id)
  n_status_large <- n_status > chunk_size
  n_groups <- ceiling(n_status / chunk_size)

  # Nothing to look up; seq_len(0) below would also skip the loop, but
  # returning here avoids the rate-limit query as well.
  if (n_groups < 1) return(NULL)

  refresh_rate_limit <- function() {
    rtweet::rate_limits(query = "statuses/lookup")
  }
  rate_limit <- refresh_rate_limit()

  # Collect per-chunk results and bind once at the end rather than growing a
  # data frame inside the loop.
  chunks <- vector("list", n_groups)

  for (idx_group in seq_len(n_groups)) {
    # Rate limit ----
    # The previous chunk consumed an unknown number of calls, so the cached
    # limit is stale after the first chunk; it is also stale once its reset
    # time has passed.
    if (idx_group > 1 || Sys.time() > rate_limit$reset_at) {
      log_debug("Updating out-of-date rate limit")
      rate_limit <- refresh_rate_limit()
    }
    if (rate_limit$remaining < 1) {
      # Seconds from now until the reset (reset minus now, never negative).
      wait_s <- difftime(rate_limit$reset_at, Sys.time(), units = "secs")
      log_info("Waiting for rate limit to reset at {rate_limit$reset_at}")
      Sys.sleep(max(0, ceiling(as.numeric(wait_s))))
      rate_limit <- refresh_rate_limit()
    }

    # Get Statuses ----
    idx_start <- (idx_group - 1) * chunk_size + 1
    idx_end <- min(idx_group * chunk_size, n_status)
    if (n_status_large) {
      log_info("Getting tweets {idx_start} to {idx_end} of {n_status}")
    } else {
      log_info("Getting {n_status} tweets")
    }

    fetched <- rtweet::lookup_statuses(status_id[idx_start:idx_end])
    # A NULL result is skipped: assigning NULL via [[<- would delete the slot.
    if (!is.null(fetched)) chunks[[idx_group]] <- fetched
  }

  do.call(rbind, chunks)
}

# Path of the lock file for `file`: same directory, base name prefixed with
# "." and no timestamp suffix, with "lock" passed to path() as the extension.
path_lock <- function(file) {
  hidden_name <- path_add(file, NULL, prepend = ".")
  path(hidden_name, ext = "lock")
}

# Rebuild `file`'s path with text added around its base name:
# <dir>/<prepend><base><append>.<ext>
#
# file:    path to modify.
# append:  text placed after the base name; defaults to a date-time suffix.
#          NULL means no suffix.
# prepend: text placed before the base name. NULL means no prefix.
path_add <- function(file, append = strftime(Sys.time(), "_%F_%H%M%S"), prepend = NULL) {
  append <- if (is.null(append)) "" else append
  prepend <- if (is.null(prepend)) "" else prepend

  file_base <- fs::path_ext_remove(fs::path_file(file))
  new_name <- glue::glue("{prepend}{file_base}{append}")

  path(fs::path_dir(file), new_name, ext = fs::path_ext(file))
}

# Abort unless a lock was acquired.
#
# lck:     lock object; NULL is treated as "no lock was obtained".
# message: glue template for the error. It is interpolated in the frame of
#          the function that called stopifnot_locked(), so it may reference
#          that caller's variables (e.g. "{file}").
#
# Returns invisible(TRUE) when `lck` is not NULL; otherwise logs the message
# at the "fatal" level and raises an error via log_fatal(). Logging alone
# would let callers carry on reading or writing without holding the lock.
stopifnot_locked <- function(lck = NULL, message = "Unable to aquire lock") {
  if (!is.null(lck)) return(invisible(TRUE))

  # parent.frame() is the immediate caller. sys.frame(1) is the bottom of the
  # call stack, which is a different frame whenever the caller is itself
  # nested inside another call.
  caller_env <- parent.frame()
  log_fatal(message, envir = caller_env)
}

# Request a shared (non-exclusive) lock on the lock file belonging to `file`.
# `timeout` is forwarded to lock() unchanged (default 1 * 60 * 1000).
shared_lock <- function(file, timeout = 1 * 60 * 1000) {
  lock_path <- path_lock(file)
  lock(lock_path, exclusive = FALSE, timeout = timeout)
}

# Request an exclusive lock on the lock file belonging to `file`.
# `timeout` is forwarded to lock() unchanged (default 1 * 60 * 1000).
exclusive_lock <- function(file, timeout = 1 * 60 * 1000) {
  lock_path <- path_lock(file)
  lock(lock_path, exclusive = TRUE, timeout = timeout)
}

+ 11
- 0
R/utils-pipe.R Zobrazit soubor

#' Pipe operator
#'
#' See \code{magrittr::\link[magrittr]{\%>\%}} for details.
#'
#' @name %>%
#' @rdname pipe
#' @keywords internal
#' @export
#' @importFrom magrittr %>%
#' @usage lhs \%>\% rhs
NULL

+ 57
- 0
README.Rmd Zobrazit soubor

---
output: github_document
---

<!-- README.md is generated from README.Rmd. Please edit that file -->

```{r setup, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "",
prompt = TRUE,
fig.path = "man/figures/README-",
out.width = "100%"
)
```
# gathertweet

The goal of gathertweet is to provide a simple command line utility that wraps key functions from [rtweet](https://github.com/ropensci/rtweet).

__gathertweet__ removes the boilerplate code required to run periodic Twitter searches and plays well with cron.

## Installation

This is a work in progress and may not work well for you yet.
But you are welcome to install **gathertweet** and try it out.

```r
# install.packages("remotes")
remotes::install_github("gadenbuie/gathertweet")
```

Once you've installed the package, you need to run

```r
gathertweet::install_gathertweet()
```

which adds `gathertweet` to `/usr/local/bin` as a symlink (you can adjust where this link is created).

## Example

```bash
# Get 100 #rstats tweets
gathertweet search --n 100 --quiet "#rstats"

# Get more tweets, automatically starting from end of the last search
gathertweet search --n 100 --quiet "#rstats"

# Update the stored data about those #rstats tweets
gathertweet update
```

## Documentation

```{bash}
gathertweet --help
```

+ 76
- 0
README.md Zobrazit soubor


<!-- README.md is generated from README.Rmd. Please edit that file -->

# gathertweet

The goal of gathertweet is to provide a simple command line utility that
wraps key functions from [rtweet](https://github.com/ropensci/rtweet).

**gathertweet** removes the boilerplate code required to run periodic
Twitter searches and plays well with cron.

## Installation

This is a work in progress and may not work well for you yet. But you
are welcome to install **gathertweet** and try it out.

``` r
# install.packages("remotes")
remotes::install_github("gadenbuie/gathertweet")
```

Once you’ve installed the package, you need to run

``` r
gathertweet::install_gathertweet()
```

which adds `gathertweet` to `/usr/local/bin` as a symlink (you can
adjust where this link is created).

## Example

``` bash
# Get 100 #rstats tweets
gathertweet search --n 100 --quiet "#rstats"

# Get more tweets, automatically starting from end of the last search
gathertweet search --n 100 --quiet "#rstats"

# Update the stored data about those #rstats tweets
gathertweet update
```

## Documentation

``` bash
> gathertweet --help
Gather tweets from the command line

Usage:
gathertweet search [--file=<file>] [options] [--] <terms>...
gathertweet update [--file=<file> --backup --polite --token --debug-args]

Arguments
<terms> Search terms. Individual search terms are queried separately,
but duplicated tweets are removed from the stored results.

Options:
-h --help Show this screen.
--file=<file> Name of RDS file where tweets are stored [default: tweets.rds]
-n, --n <n> Number of tweets to return [default: 18000]
--type <type> Type of search results: "recent", "mixed", or "popular". [default: recent]
--include_rts Logical indicating whether retweets should be included
--geocode <geocode> Geographical limiter of the template "latitude,longitude,radius"
--max_id <max_id> Return results with an ID less than (older than) or equal to max_id
--since_id <since_id> Return results with an ID greather than (newer than) or equal to since_id,
automatically extracted from the existing tweets <file>, if it exists, and
ignored when <max_id> is set. [default: last]
--no-parse Disable parsing of the results
--token <token> See {rtweet} for more information
--retryonratelimit Wait and retry when rate limited (only relevant when n exceeds 18000 tweets)
--quiet Disable printing of {rtweet} processing/retrieval messages
--polite Only allow one process (search|update) to run at a time
--backup Create a backup of existing tweet file before writing any new files
--debug-args Print values of the arguments only
```

+ 21
- 0
gathertweet.Rproj Zobrazit soubor

Version: 1.0

RestoreWorkspace: No
SaveWorkspace: No
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace

+ 113
- 0
inst/gathertweet.R Zobrazit soubor

#! /usr/bin/env Rscript

# Usage -------------------------------------------------------------------
'Gather tweets from the command line

Usage:
gathertweet search [--file=<file>] [options] [--] <terms>...
gathertweet update [--file=<file> --backup --polite --token --debug-args]

Arguments
<terms> Search terms. Individual search terms are queried separately,
but duplicated tweets are removed from the stored results.

Options:
-h --help Show this screen.
--file=<file> Name of RDS file where tweets are stored [default: tweets.rds]
-n, --n <n> Number of tweets to return [default: 18000]
--type <type> Type of search results: "recent", "mixed", or "popular". [default: recent]
--include_rts Logical indicating whether retweets should be included
--geocode <geocode> Geographical limiter of the template "latitude,longitude,radius"
--max_id <max_id> Return results with an ID less than (older than) or equal to max_id
--since_id <since_id> Return results with an ID greather than (newer than) or equal to since_id,
automatically extracted from the existing tweets <file>, if it exists, and
ignored when <max_id> is set. [default: last]
--no-parse Disable parsing of the results
--token <token> See {rtweet} for more information
--retryonratelimit Wait and retry when rate limited (only relevant when n exceeds 18000 tweets)
--quiet Disable printing of {rtweet} processing/retrieval messages
--polite Only allow one process (search|update) to run at a time
--backup Create a backup of existing tweet file before writing any new files
--debug-args Print values of the arguments only
' -> doc

# Parse the command line against the usage text stored in `doc`.
library(docopt)
args <- docopt(doc, version = paste('gathertweet version', packageVersion("gathertweet")))
# Quit R without saving the workspace; `value` is passed to q() as the second
# positional argument (the exit status).
exit <- function(value = 0) q(save = "no", value)

# --debug-args: print the parsed arguments, save them to "args.rds" in the
# current directory, and quit before doing any work.
if (args$`--debug-args`) {
str(args)
saveRDS(args, "args.rds")
exit()
}

library(gathertweet)

# --polite: take an exclusive lock (timeout = 0) on a lock file whose name is
# derived from a digest of the file/search/update argument values.
if (args$polite) {
lockfile <- paste0(".gathertweet_",
digest::digest(args[c("file", "search", "update")]),
".lock")
lck <- filelock::lock(lockfile, exclusive = TRUE, timeout = 0)
# Name of whichever of search/update is TRUE; used in the message below.
action <- names(Filter(isTRUE, args[c("search", "update")]))
# NOTE(review): whether the script actually stops here depends on
# stopifnot_locked() raising an error when `lck` is NULL - confirm.
gathertweet:::stopifnot_locked(lck, "Another gathertweet {action} process is currently running for {args$file}")
}

# Search ------------------------------------------------------------------
if (isTRUE(args$search)) {

log_info("Searching for \"{paste0(args$terms, collapse = '\", \"')}\"")

max_id <- args[["max_id"]]
since_id <- args[["since_id"]]
# since_id is only used when max_id is not given: "last" is resolved to the
# value of last_seen_tweet() for the tweet file, anything else is used as
# supplied. When max_id is set the `if` has no else branch, so since_id
# becomes NULL.
since_id <- if (is.null(max_id)) {
if (since_id == "last") {
last_seen_tweet(file = args$file)
} else since_id
}
if (!is.null(since_id)) log_info("Tweets from {since_id}")
if (!is.null(max_id)) log_info("Tweets up to {max_id}")

tweets <- rtweet::search_tweets2(
q = args$terms,
n = as.integer(args$n),
type = args$type,
include_rts = args$include_rts,
geocode = args$geocode,
max_id = max_id,
parse = !args[["no-parse"]],
token = args$token,
retryonratelimit = args$retryonratelimit,
verbose = !args$quiet,
since_id = since_id
)

# NOTE(review): exit() quits immediately, so the --polite unlock/unlink at
# the bottom of the script is not reached on this path and the lock file is
# left on disk - confirm whether that is intended.
if (nrow(tweets) == 0) {
log_info("No new tweets.")
exit()
}

# Sort by status id, then drop rows whose status id was already seen.
tweets <- tweets[order(tweets$status_id), ]
tweets <- tweets[!duplicated(tweets$status_id), ]

log_info("Gathered {nrow(tweets)} tweets")
# Back up the existing file (if requested) before it is overwritten.
if (args$backup) backup_tweets(args$file)
tweets <- save_tweets(tweets, args$file)

log_info("Total of {nrow(tweets)} tweets in {args$file}")

# Update ------------------------------------------------------------------
} else if (isTRUE(args$update)) {
logger("Updating tweets in {args$file}")
# Re-fetch the stored tweets, then write the result back to the same file.
tweets <- update_tweets(file = args$file)
log_debug("Status lookup returned {nrow(tweets)} tweets")
if (args$backup) backup_tweets(args$file)
tweets <- save_tweets(tweets, args$file)
log_debug("Total of {nrow(tweets)} tweets in {args$file}")
log_info("Tweet update complete")

}

# Release the --polite lock and remove its lock file.
if (args$polite) {
filelock::unlock(lck)
unlink(lockfile)
}

+ 18
- 0
man/gathertweet-package.Rd Zobrazit soubor

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/gathertweet-package.R
\docType{package}
\name{gathertweet-package}
\alias{gathertweet}
\alias{gathertweet-package}
\title{gathertweet: Gather Tweets from the Command Line with rtweet}
\description{
A command line utility for common tweet gathering
tasks. Leverages the rtweet package for cron-friendly,
boilerplate-free tweet collection. Currently in progress and only
suitable for small search-based tweet collection projects.
}
\author{
\strong{Maintainer}: Garrick Aden-Buie \email{garrick@adenbuie.com}

}
\keyword{internal}

+ 16
- 0
man/install_gathertweet.Rd Zobrazit soubor

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/install.R
\name{install_gathertweet}
\alias{install_gathertweet}
\title{Install gathertweet executable script}
\usage{
install_gathertweet(location = "/usr/local/bin")
}
\arguments{
\item{location}{Where to install the gathertweet executable script}
}
\description{
Installs the \code{gathertweet} executable script to the location. Should work
with Unix and MacOS out of the box, but I can't make any guarantees about
Windows.
}

+ 29
- 0
man/logger.Rd Zobrazit soubor

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/logging.R
\name{logger}
\alias{logger}
\alias{log_info}
\alias{log_debug}
\alias{log_warn}
\alias{log_error}
\alias{log_fatal}
\alias{log_pipe}
\title{Logging functions}
\usage{
logger(..., level = "info", envir = parent.frame())

log_info(..., envir = parent.frame())

log_debug(..., envir = parent.frame())

log_warn(..., envir = parent.frame())

log_error(..., envir = parent.frame())

log_fatal(..., envir = parent.frame())

log_pipe(.data, ..., level = "info")
}
\description{
Logging functions
}

+ 12
- 0
man/pipe.Rd Zobrazit soubor

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils-pipe.R
\name{\%>\%}
\alias{\%>\%}
\title{Pipe operator}
\usage{
lhs \%>\% rhs
}
\description{
See \code{magrittr::\link[magrittr]{\%>\%}} for details.
}
\keyword{internal}

Načítá se…
Zrušit
Uložit