From 4a47cc650e22d865c87cd4eee04f56724d94c126 Mon Sep 17 00:00:00 2001
From: 4pr0n
Date: Wed, 5 Mar 2014 04:56:13 -0800
Subject: [PATCH] Added twitter support.

---
 .../ripme/ripper/rippers/TwitterRipper.java   | 282 ++++++++++++++++++
 src/main/resources/rip.properties             |   3 +-
 .../tst/ripper/rippers/TwitterRipperTest.java |  28 ++
 3 files changed, 312 insertions(+), 1 deletion(-)
 create mode 100644 src/main/java/com/rarchives/ripme/ripper/rippers/TwitterRipper.java
 create mode 100644 src/test/java/com/rarchives/ripme/tst/ripper/rippers/TwitterRipperTest.java

diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/TwitterRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/TwitterRipper.java
new file mode 100644
index 000000000..bb26285cc
--- /dev/null
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/TwitterRipper.java
@@ -0,0 +1,282 @@
+package com.rarchives.ripme.ripper.rippers;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.log4j.Logger;
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+import org.json.JSONTokener;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+import com.rarchives.ripme.ripper.AbstractRipper;
+import com.rarchives.ripme.utils.Utils;
+
+public class TwitterRipper extends AbstractRipper {
+
+    private static final String DOMAIN = "twitter.com",
+                                HOST   = "twitter";
+    private static final Logger logger = Logger.getLogger(TwitterRipper.class);
+
+    private static final int MAX_REQUESTS = 2;
+    private static final int WAIT_TIME = 2000;
+
+    // Base 64 of consumer key : consumer secret
+    private String authKey;
+    private String accessToken;
+
+    private enum ALBUM_TYPE {
+        ACCOUNT,
+        SEARCH
+    }
+    private ALBUM_TYPE albumType;
+    private String searchText, accountName;
+
+    public TwitterRipper(URL url) throws IOException {
+        super(url);
+        authKey = Utils.getConfigString("twitter.auth", null);
+        if (authKey == null) {
+            throw new IOException("Could not find twitter authentication key in configuration");
+        }
+    }
+
+    @Override
+    public boolean canRip(URL url) {
+        return url.getHost().endsWith(DOMAIN);
+    }
+
+    @Override
+    public URL sanitizeURL(URL url) throws MalformedURLException {
+        // https://twitter.com/search?q=from%3Apurrbunny%20filter%3Aimages&src=typd
+        Pattern p = Pattern.compile("^https?://(m\\.)?twitter\\.com/search\\?q=([a-zA-Z0-9%]{1,}).*$");
+        Matcher m = p.matcher(url.toExternalForm());
+        if (m.matches()) {
+            albumType = ALBUM_TYPE.SEARCH;
+            searchText = m.group(2);
+            return url;
+        }
+        p = Pattern.compile("^https?://(m\\.)?twitter\\.com/([a-zA-Z0-9]{1,}).*$");
+        m = p.matcher(url.toExternalForm());
+        if (m.matches()) {
+            albumType = ALBUM_TYPE.ACCOUNT;
+            accountName = m.group(2);
+            return url;
+        }
+        throw new MalformedURLException("Expected username or search string in url: " + url);
+    }
+
+    private void getAccessToken() throws IOException {
+        Document doc = Jsoup.connect("https://api.twitter.com/oauth2/token")
+                            .ignoreContentType(true)
+                            .header("Authorization", "Basic " + authKey)
+                            .header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
+                            .header("User-agent", "ripe and zipe")
+                            .data("grant_type", "client_credentials")
+                            .post();
+        String body = doc.body().html().replaceAll("&quot;", "\"");
+        try {
+            JSONObject json = new JSONObject(body);
+            accessToken = json.getString("access_token");
+            return;
+        } catch (JSONException e) {
+            // Fall through
+            throw new IOException("Failure while parsing JSON: " + body, e);
+        }
+    }
+
+    private void checkRateLimits(String resource, String api) throws IOException {
+        Document doc = Jsoup.connect("https://api.twitter.com/1.1/application/rate_limit_status.json?resources=" + resource)
+                            .ignoreContentType(true)
+                            .header("Authorization", "Bearer " + accessToken)
+                            .header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
+                            .header("User-agent", "ripe and zipe")
+                            .get();
+        String body = doc.body().html().replaceAll("&quot;", "\"");
+        try {
+            JSONObject json = new JSONObject(body);
+            JSONObject stats = json.getJSONObject("resources")
+                                   .getJSONObject(resource)
+                                   .getJSONObject(api);
+            int remaining = stats.getInt("remaining");
+            logger.info(" Twitter " + resource + " calls remaining: " + remaining);
+            if (remaining < 20) {
+                logger.error("Twitter API calls exhausted: " + stats.toString());
+                throw new IOException("Less than 20 API calls remaining; not enough to rip.");
+            }
+        } catch (JSONException e) {
+            logger.error("JSONException: ", e);
+            throw new IOException("Error while parsing JSON: " + body, e);
+        }
+    }
+
+    private String getApiURL(String maxID) {
+        String req = "";
+        switch (albumType) {
+        case ACCOUNT:
+            req = "https://api.twitter.com/1.1/statuses/user_timeline.json"
+                    + "?screen_name=" + this.accountName
+                    + "&include_entities=true"
+                    + "&exclude_replies=true"
+                    + "&trim_user=true"
+                    + "&include_rts=false"
+                    + "&count=" + 200;
+            break;
+        case SEARCH:
+            req = "https://api.twitter.com/1.1/search/tweets.json"
+                    + "?q=" + this.searchText
+                    + "&include_entities=true"
+                    + "&result_type=recent"
+                    + "&count=100";
+            break;
+        }
+        if (maxID != null) {
+            req += "&max_id=" + maxID;
+        }
+        return req;
+    }
+
+    private List<JSONObject> getTweets(String url) throws IOException {
+        List<JSONObject> tweets = new ArrayList<JSONObject>();
+        logger.info(" Retrieving " + url);
+        Document doc = Jsoup.connect(url)
+                            .ignoreContentType(true)
+                            .header("Authorization", "Bearer " + accessToken)
+                            .header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
+                            .header("User-agent", "ripe and zipe")
+                            .get();
+        String body = doc.body().html().replaceAll("&quot;", "\"");
+        Object jsonObj = new JSONTokener(body).nextValue();
+        JSONArray statuses;
+        if (jsonObj instanceof JSONObject) {
+            JSONObject json = (JSONObject) jsonObj;
+            if (json.has("errors")) {
+                String msg = json.getJSONObject("errors").getString("message");
+                throw new IOException("Twitter responded with errors: " + msg);
+            }
+            statuses = json.getJSONArray("statuses");
+        } else {
+            statuses = (JSONArray) jsonObj;
+        }
+        for (int i = 0; i < statuses.length(); i++) {
+            tweets.add((JSONObject) statuses.get(i));
+        }
+        return tweets;
+    }
+
+    private void parseTweet(JSONObject tweet) throws MalformedURLException {
+        if (!tweet.has("entities")) {
+            logger.error("XXX Tweet doesn't have entitites");
+            return;
+        }
+
+        JSONObject entities = tweet.getJSONObject("entities");
+
+        if (entities.has("media")) {
+            JSONArray medias = entities.getJSONArray("media");
+            String url;
+            JSONObject media;
+            for (int i = 0; i < medias.length(); i++) {
+                media = (JSONObject) medias.get(i);
+                url = media.getString("media_url");
+                if (url.contains(".twimg.com/")) {
+                    url += ":large";
+                }
+                addURLToDownload(new URL(url));
+            }
+        }
+
+        if (entities.has("urls")) {
+            JSONArray urls = entities.getJSONArray("urls");
+            JSONObject url;
+            for (int i = 0; i < urls.length(); i++) {
+                url = (JSONObject) urls.get(i);
+                if (url.get("expanded_url") != null) {
+                    handleTweetedURL(url.getString("url"));
+                } else {
+                    handleTweetedURL(url.getString("expanded_url"));
+                }
+            }
+        }
+    }
+
+    private void handleTweetedURL(String url) {
+        logger.error("[!] Need to handle URL: " + url);
+    }
+
+    @Override
+    public void rip() throws IOException {
+        getAccessToken();
+
+        switch (albumType) {
+        case ACCOUNT:
+            checkRateLimits("statuses", "/statuses/user_timeline");
+            break;
+        case SEARCH:
+            checkRateLimits("search", "/search/tweets");
+            break;
+        }
+
+        String maxID = null;
+        for (int i = 0; i < MAX_REQUESTS; i++) {
+            List<JSONObject> tweets = getTweets(getApiURL(maxID));
+            if (tweets.size() == 0) {
+                logger.info(" No more tweets found.");
+                break;
+            }
+            for (JSONObject tweet : tweets) {
+                maxID = tweet.getString("id_str");
+                parseTweet(tweet);
+            }
+
+            try {
+                Thread.sleep(WAIT_TIME);
+            } catch (InterruptedException e) {
+                logger.error("[!] Interrupted while waiting to load more results", e);
+                break;
+            }
+        }
+
+        waitForThreads();
+    }
+
+    @Override
+    public String getHost() {
+        return HOST;
+    }
+
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        switch (albumType) {
+        case ACCOUNT:
+            return "account_" + accountName;
+        case SEARCH:
+            StringBuilder gid = new StringBuilder();
+            for (int i = 0; i < searchText.length(); i++) {
+                char c = searchText.charAt(i);
+                // Ignore URL-encoded chars
+                if (c == '%') {
+                    gid.append('_');
+                    i += 2;
+                    continue;
+                // Ignore non-alphanumeric chars
+                } else if (
+                           (c >= 'a' && c <= 'z')
+                        || (c >= 'A' && c <= 'Z')
+                        || (c >= '0' && c <= '9')
+                        ) {
+                    gid.append(c);
+                }
+            }
+            return "search_" + gid.toString();
+        }
+        throw new MalformedURLException("Could not decide type of URL (search/account): " + url);
+    }
+
+}
diff --git a/src/main/resources/rip.properties b/src/main/resources/rip.properties
index c0c4c2144..2fbcf1134 100644
--- a/src/main/resources/rip.properties
+++ b/src/main/resources/rip.properties
@@ -1,3 +1,4 @@
 threads.size = 5
 file.overwrite = false
-download.retries = 3
\ No newline at end of file
+download.retries = 3
+twitter.auth = VW9Ybjdjb1pkd2J0U3kwTUh2VXVnOm9GTzVQVzNqM29LQU1xVGhnS3pFZzhKbGVqbXU0c2lHQ3JrUFNNZm8=
\ No newline at end of file
diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/TwitterRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/TwitterRipperTest.java
new file mode 100644
index 000000000..094c3f428
--- /dev/null
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/TwitterRipperTest.java
@@ -0,0 +1,28 @@
+package com.rarchives.ripme.tst.ripper.rippers;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.rarchives.ripme.ripper.rippers.TwitterRipper;
+
+public class TwitterRipperTest extends RippersTest {
+
+    public void testTwitterAlbums() throws IOException {
+        List<URL> contentURLs = new ArrayList<URL>();
+        //contentURLs.add(new URL("https://twitter.com/danngamber01/media"));
+        contentURLs.add(new URL("https://twitter.com/search?q=from%3Apurrbunny%20filter%3Aimages&src=typd"));
+        for (URL url : contentURLs) {
+            try {
+                TwitterRipper ripper = new TwitterRipper(url);
+                ripper.rip();
+                assert(ripper.getWorkingDir().listFiles().length > 1);
+                deleteDir(ripper.getWorkingDir());
+            } catch (Exception e) {
+                e.printStackTrace();
+                fail("Error while ripping URL " + url + ": " + e.getMessage());
+            }
+        }
+    }
+}
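
Note on the twitter.auth setting: TwitterRipper reads it from rip.properties and sends it as an HTTP Basic credential in getAccessToken() when requesting an application-only bearer token. Per the comment in the ripper, the value is the Base64 encoding of "consumer key : consumer secret". Below is a minimal sketch of producing that value for your own Twitter application; it is not part of this commit, it assumes Java 8's java.util.Base64 is available, and the key/secret strings are placeholders rather than real credentials.

    import java.nio.charset.StandardCharsets;
    import java.util.Base64;

    // Hypothetical helper (not in this patch): builds the twitter.auth value that
    // getAccessToken() sends behind its "Authorization: Basic ..." header.
    public class TwitterAuthKeyGenerator {
        public static void main(String[] args) {
            String consumerKey = "YOUR_CONSUMER_KEY";       // placeholder
            String consumerSecret = "YOUR_CONSUMER_SECRET"; // placeholder
            String raw = consumerKey + ":" + consumerSecret;
            String encoded = Base64.getEncoder()
                    .encodeToString(raw.getBytes(StandardCharsets.UTF_8));
            // Paste the printed string into rip.properties as the twitter.auth value.
            System.out.println(encoded);
        }
    }

Running this once and replacing the twitter.auth line added in rip.properties above swaps in your own application's credentials without touching the ripper code.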