-
Notifications
You must be signed in to change notification settings - Fork 203
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
312 additions
and
1 deletion.
There are no files selected for viewing
282 changes: 282 additions & 0 deletions
282
src/main/java/com/rarchives/ripme/ripper/rippers/TwitterRipper.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,282 @@ | ||
package com.rarchives.ripme.ripper.rippers; | ||
|
||
import java.io.IOException; | ||
import java.net.MalformedURLException; | ||
import java.net.URL; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.regex.Matcher; | ||
import java.util.regex.Pattern; | ||
|
||
import org.apache.log4j.Logger; | ||
import org.json.JSONArray; | ||
import org.json.JSONException; | ||
import org.json.JSONObject; | ||
import org.json.JSONTokener; | ||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
|
||
import com.rarchives.ripme.ripper.AbstractRipper; | ||
import com.rarchives.ripme.utils.Utils; | ||
|
||
public class TwitterRipper extends AbstractRipper { | ||
|
||
private static final String DOMAIN = "twitter.com", | ||
HOST = "twitter"; | ||
private static final Logger logger = Logger.getLogger(TwitterRipper.class); | ||
|
||
private static final int MAX_REQUESTS = 2; | ||
private static final int WAIT_TIME = 2000; | ||
|
||
// Base 64 of consumer key : consumer secret | ||
private String authKey; | ||
private String accessToken; | ||
|
||
private enum ALBUM_TYPE { | ||
ACCOUNT, | ||
SEARCH | ||
} | ||
private ALBUM_TYPE albumType; | ||
private String searchText, accountName; | ||
|
||
public TwitterRipper(URL url) throws IOException { | ||
super(url); | ||
authKey = Utils.getConfigString("twitter.auth", null); | ||
if (authKey == null) { | ||
throw new IOException("Could not find twitter authentication key in configuration"); | ||
} | ||
} | ||
|
||
@Override | ||
public boolean canRip(URL url) { | ||
return url.getHost().endsWith(DOMAIN); | ||
} | ||
|
||
@Override | ||
public URL sanitizeURL(URL url) throws MalformedURLException { | ||
// https://twitter.com/search?q=from%3Apurrbunny%20filter%3Aimages&src=typd | ||
Pattern p = Pattern.compile("^https?://(m\\.)?twitter\\.com/search\\?q=([a-zA-Z0-9%]{1,}).*$"); | ||
Matcher m = p.matcher(url.toExternalForm()); | ||
if (m.matches()) { | ||
albumType = ALBUM_TYPE.SEARCH; | ||
searchText = m.group(2); | ||
return url; | ||
} | ||
p = Pattern.compile("^https?://(m\\.)?twitter\\.com/([a-zA-Z0-9]{1,}).*$"); | ||
m = p.matcher(url.toExternalForm()); | ||
if (m.matches()) { | ||
albumType = ALBUM_TYPE.ACCOUNT; | ||
accountName = m.group(2); | ||
return url; | ||
} | ||
throw new MalformedURLException("Expected username or search string in url: " + url); | ||
} | ||
|
||
private void getAccessToken() throws IOException { | ||
Document doc = Jsoup.connect("https://api.twitter.com/oauth2/token") | ||
.ignoreContentType(true) | ||
.header("Authorization", "Basic " + authKey) | ||
.header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8") | ||
.header("User-agent", "ripe and zipe") | ||
.data("grant_type", "client_credentials") | ||
.post(); | ||
String body = doc.body().html().replaceAll(""", "\""); | ||
try { | ||
JSONObject json = new JSONObject(body); | ||
accessToken = json.getString("access_token"); | ||
return; | ||
} catch (JSONException e) { | ||
// Fall through | ||
throw new IOException("Failure while parsing JSON: " + body, e); | ||
} | ||
} | ||
|
||
private void checkRateLimits(String resource, String api) throws IOException { | ||
Document doc = Jsoup.connect("https://api.twitter.com/1.1/application/rate_limit_status.json?resources=" + resource) | ||
.ignoreContentType(true) | ||
.header("Authorization", "Bearer " + accessToken) | ||
.header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8") | ||
.header("User-agent", "ripe and zipe") | ||
.get(); | ||
String body = doc.body().html().replaceAll(""", "\""); | ||
try { | ||
JSONObject json = new JSONObject(body); | ||
JSONObject stats = json.getJSONObject("resources") | ||
.getJSONObject(resource) | ||
.getJSONObject(api); | ||
int remaining = stats.getInt("remaining"); | ||
logger.info(" Twitter " + resource + " calls remaining: " + remaining); | ||
if (remaining < 20) { | ||
logger.error("Twitter API calls exhausted: " + stats.toString()); | ||
throw new IOException("Less than 20 API calls remaining; not enough to rip."); | ||
} | ||
} catch (JSONException e) { | ||
logger.error("JSONException: ", e); | ||
throw new IOException("Error while parsing JSON: " + body, e); | ||
} | ||
} | ||
|
||
private String getApiURL(String maxID) { | ||
String req = ""; | ||
switch (albumType) { | ||
case ACCOUNT: | ||
req = "https://api.twitter.com/1.1/statuses/user_timeline.json" | ||
+ "?screen_name=" + this.accountName | ||
+ "&include_entities=true" | ||
+ "&exclude_replies=true" | ||
+ "&trim_user=true" | ||
+ "&include_rts=false" | ||
+ "&count=" + 200; | ||
break; | ||
case SEARCH: | ||
req = "https://api.twitter.com/1.1/search/tweets.json" | ||
+ "?q=" + this.searchText | ||
+ "&include_entities=true" | ||
+ "&result_type=recent" | ||
+ "&count=100"; | ||
break; | ||
} | ||
if (maxID != null) { | ||
req += "&max_id=" + maxID; | ||
} | ||
return req; | ||
} | ||
|
||
private List<JSONObject> getTweets(String url) throws IOException { | ||
List<JSONObject> tweets = new ArrayList<JSONObject>(); | ||
logger.info(" Retrieving " + url); | ||
Document doc = Jsoup.connect(url) | ||
.ignoreContentType(true) | ||
.header("Authorization", "Bearer " + accessToken) | ||
.header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8") | ||
.header("User-agent", "ripe and zipe") | ||
.get(); | ||
String body = doc.body().html().replaceAll(""", "\""); | ||
Object jsonObj = new JSONTokener(body).nextValue(); | ||
JSONArray statuses; | ||
if (jsonObj instanceof JSONObject) { | ||
JSONObject json = (JSONObject) jsonObj; | ||
if (json.has("errors")) { | ||
String msg = json.getJSONObject("errors").getString("message"); | ||
throw new IOException("Twitter responded with errors: " + msg); | ||
} | ||
statuses = json.getJSONArray("statuses"); | ||
} else { | ||
statuses = (JSONArray) jsonObj; | ||
} | ||
for (int i = 0; i < statuses.length(); i++) { | ||
tweets.add((JSONObject) statuses.get(i)); | ||
} | ||
return tweets; | ||
} | ||
|
||
private void parseTweet(JSONObject tweet) throws MalformedURLException { | ||
if (!tweet.has("entities")) { | ||
logger.error("XXX Tweet doesn't have entitites"); | ||
return; | ||
} | ||
|
||
JSONObject entities = tweet.getJSONObject("entities"); | ||
|
||
if (entities.has("media")) { | ||
JSONArray medias = entities.getJSONArray("media"); | ||
String url; | ||
JSONObject media; | ||
for (int i = 0; i < medias.length(); i++) { | ||
media = (JSONObject) medias.get(i); | ||
url = media.getString("media_url"); | ||
if (url.contains(".twimg.com/")) { | ||
url += ":large"; | ||
} | ||
addURLToDownload(new URL(url)); | ||
} | ||
} | ||
|
||
if (entities.has("urls")) { | ||
JSONArray urls = entities.getJSONArray("urls"); | ||
JSONObject url; | ||
for (int i = 0; i < urls.length(); i++) { | ||
url = (JSONObject) urls.get(i); | ||
if (url.get("expanded_url") != null) { | ||
handleTweetedURL(url.getString("url")); | ||
} else { | ||
handleTweetedURL(url.getString("expanded_url")); | ||
} | ||
} | ||
} | ||
} | ||
|
||
private void handleTweetedURL(String url) { | ||
logger.error("[!] Need to handle URL: " + url); | ||
} | ||
|
||
@Override | ||
public void rip() throws IOException { | ||
getAccessToken(); | ||
|
||
switch (albumType) { | ||
case ACCOUNT: | ||
checkRateLimits("statuses", "/statuses/user_timeline"); | ||
break; | ||
case SEARCH: | ||
checkRateLimits("search", "/search/tweets"); | ||
break; | ||
} | ||
|
||
String maxID = null; | ||
for (int i = 0; i < MAX_REQUESTS; i++) { | ||
List<JSONObject> tweets = getTweets(getApiURL(maxID)); | ||
if (tweets.size() == 0) { | ||
logger.info(" No more tweets found."); | ||
break; | ||
} | ||
for (JSONObject tweet : tweets) { | ||
maxID = tweet.getString("id_str"); | ||
parseTweet(tweet); | ||
} | ||
|
||
try { | ||
Thread.sleep(WAIT_TIME); | ||
} catch (InterruptedException e) { | ||
logger.error("[!] Interrupted while waiting to load more results", e); | ||
break; | ||
} | ||
} | ||
|
||
waitForThreads(); | ||
} | ||
|
||
@Override | ||
public String getHost() { | ||
return HOST; | ||
} | ||
|
||
@Override | ||
public String getGID(URL url) throws MalformedURLException { | ||
switch (albumType) { | ||
case ACCOUNT: | ||
return "account_" + accountName; | ||
case SEARCH: | ||
StringBuilder gid = new StringBuilder(); | ||
for (int i = 0; i < searchText.length(); i++) { | ||
char c = searchText.charAt(i); | ||
// Ignore URL-encoded chars | ||
if (c == '%') { | ||
gid.append('_'); | ||
i += 2; | ||
continue; | ||
// Ignore non-alphanumeric chars | ||
} else if ( | ||
(c >= 'a' && c <= 'z') | ||
|| (c >= 'A' && c <= 'Z') | ||
|| (c >= '0' && c <= '9') | ||
) { | ||
gid.append(c); | ||
} | ||
} | ||
return "search_" + gid.toString(); | ||
} | ||
throw new MalformedURLException("Could not decide type of URL (search/account): " + url); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
threads.size = 5 | ||
file.overwrite = false | ||
download.retries = 3 | ||
download.retries = 3 | ||
twitter.auth = VW9Ybjdjb1pkd2J0U3kwTUh2VXVnOm9GTzVQVzNqM29LQU1xVGhnS3pFZzhKbGVqbXU0c2lHQ3JrUFNNZm8= |
28 changes: 28 additions & 0 deletions
28
src/test/java/com/rarchives/ripme/tst/ripper/rippers/TwitterRipperTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
package com.rarchives.ripme.tst.ripper.rippers; | ||
|
||
import java.io.IOException; | ||
import java.net.URL; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
import com.rarchives.ripme.ripper.rippers.TwitterRipper; | ||
|
||
public class TwitterRipperTest extends RippersTest { | ||
|
||
public void testTwitterAlbums() throws IOException { | ||
List<URL> contentURLs = new ArrayList<URL>(); | ||
//contentURLs.add(new URL("https://twitter.com/danngamber01/media")); | ||
contentURLs.add(new URL("https://twitter.com/search?q=from%3Apurrbunny%20filter%3Aimages&src=typd")); | ||
for (URL url : contentURLs) { | ||
try { | ||
TwitterRipper ripper = new TwitterRipper(url); | ||
ripper.rip(); | ||
assert(ripper.getWorkingDir().listFiles().length > 1); | ||
deleteDir(ripper.getWorkingDir()); | ||
} catch (Exception e) { | ||
e.printStackTrace(); | ||
fail("Error while ripping URL " + url + ": " + e.getMessage()); | ||
} | ||
} | ||
} | ||
} |