-
Notifications
You must be signed in to change notification settings - Fork 627
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
1.0.17 - Added generic *chan ripper #8
- Loading branch information
Showing
4 changed files
with
180 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
107 changes: 107 additions & 0 deletions
107
src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
package com.rarchives.ripme.ripper.rippers; | ||
|
||
import java.io.IOException; | ||
import java.net.MalformedURLException; | ||
import java.net.URL; | ||
import java.util.HashSet; | ||
import java.util.Set; | ||
import java.util.regex.Matcher; | ||
import java.util.regex.Pattern; | ||
|
||
import org.apache.log4j.Logger; | ||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.nodes.Element; | ||
|
||
import com.rarchives.ripme.ripper.AbstractRipper; | ||
|
||
public class ChanRipper extends AbstractRipper { | ||
|
||
private static final Logger logger = Logger.getLogger(ChanRipper.class); | ||
|
||
public ChanRipper(URL url) throws IOException { | ||
super(url); | ||
} | ||
|
||
@Override | ||
public String getHost() { | ||
String host = this.url.getHost(); | ||
host = host.substring(0, host.lastIndexOf('.')); | ||
if (host.contains(".")) { | ||
// Host has subdomain (www) | ||
host = host.substring(host.lastIndexOf('.') + 1); | ||
} | ||
String board = this.url.toExternalForm().split("/")[3]; | ||
return host + "_" + board; | ||
} | ||
|
||
@Override | ||
public boolean canRip(URL url) { | ||
// TODO Whitelist? | ||
return url.getHost().contains("chan") && url.toExternalForm().contains("/res/"); | ||
} | ||
|
||
/** | ||
* Reformat given URL into the desired format (all images on single page) | ||
*/ | ||
public URL sanitizeURL(URL url) throws MalformedURLException { | ||
return url; | ||
} | ||
|
||
@Override | ||
public String getGID(URL url) throws MalformedURLException { | ||
Pattern p; Matcher m; | ||
|
||
p = Pattern.compile("^.*chan.*\\.[a-z]{2,3}/[a-z]+/res/([0-9]+)(\\.html|\\.php)?.*$"); | ||
m = p.matcher(url.toExternalForm()); | ||
if (m.matches()) { | ||
return m.group(1); | ||
} | ||
|
||
throw new MalformedURLException( | ||
"Expected *chan URL formats: " | ||
+ "*chan.com/@/res/####.html" | ||
+ " Got: " + url); | ||
} | ||
|
||
@Override | ||
public void rip() throws IOException { | ||
Set<String> attempted = new HashSet<String>(); | ||
int index = 0; | ||
Pattern p; Matcher m; | ||
logger.info(" Retrieving " + this.url.toExternalForm()); | ||
Document doc = Jsoup.connect(this.url.toExternalForm()) | ||
.userAgent(USER_AGENT) | ||
.get(); | ||
for (Element link : doc.select("a")) { | ||
if (!link.hasAttr("href")) { | ||
continue; | ||
} | ||
if (!link.attr("href").contains("/src/")) { | ||
logger.debug("Skipping link that does not contain /src/: " + link.attr("href")); | ||
continue; | ||
} | ||
System.err.println("URL=" + link.attr("href")); | ||
p = Pattern.compile("^.*\\.(jpg|jpeg|png|gif)$", Pattern.CASE_INSENSITIVE); | ||
m = p.matcher(link.attr("href")); | ||
if (m.matches()) { | ||
String image = link.attr("href"); | ||
if (image.startsWith("//")) { | ||
image = "http:" + image; | ||
} | ||
if (image.startsWith("/")) { | ||
image = "http://" + this.url.getHost() + image; | ||
} | ||
if (attempted.contains(image)) { | ||
logger.debug("Already attempted: " + image); | ||
continue; | ||
} | ||
index += 1; | ||
addURLToDownload(new URL(image), String.format("%03d_", index)); | ||
attempted.add(image); | ||
} | ||
} | ||
waitForThreads(); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
71 changes: 71 additions & 0 deletions
71
src/test/java/com/rarchives/ripme/tst/ripper/rippers/ChanRipperTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
package com.rarchives.ripme.tst.ripper.rippers; | ||
|
||
import java.io.IOException; | ||
import java.net.URL; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
import com.rarchives.ripme.ripper.rippers.ChanRipper; | ||
|
||
public class ChanRipperTest extends RippersTest { | ||
|
||
public void testChanURLFailures() throws IOException { | ||
List<URL> failURLs = new ArrayList<URL>(); | ||
// URLs that should not work | ||
for (URL url : failURLs) { | ||
try { | ||
new ChanRipper(url); | ||
fail("Instantiated ripper for URL that should not work: " + url); | ||
} catch (Exception e) { | ||
// Expected | ||
continue; | ||
} | ||
} | ||
} | ||
|
||
public void testChanURLPasses() throws IOException { | ||
List<URL> passURLs = new ArrayList<URL>(); | ||
// URLs that should work | ||
passURLs.add(new URL("http://desuchan.net/v/res/7034.html")); | ||
passURLs.add(new URL("http://boards.4chan.org/r/res/12225949")); | ||
passURLs.add(new URL("http://boards.420chan.org/ana/res/75984.php")); | ||
passURLs.add(new URL("http://7chan.org/gif/res/23795.html")); | ||
passURLs.add(new URL("http://unichan2.org/b/res/518004.html")); | ||
passURLs.add(new URL("http://xchan.pw/porn/res/437.html")); | ||
for (URL url : passURLs) { | ||
try { | ||
ChanRipper ripper = new ChanRipper(url); | ||
assert(ripper.canRip(url)); | ||
deleteDir(ripper.getWorkingDir()); | ||
} catch (Exception e) { | ||
fail("Failed to instantiate ripper for " + url); | ||
} | ||
} | ||
} | ||
|
||
public void testChanRipper() throws IOException { | ||
if (!DOWNLOAD_CONTENT) { | ||
return; | ||
} | ||
List<URL> contentURLs = new ArrayList<URL>(); | ||
// URLs that should return more than 1 image | ||
contentURLs.add(new URL("http://desuchan.net/v/res/7034.html")); | ||
contentURLs.add(new URL("http://boards.4chan.org/r/res/12225949")); | ||
contentURLs.add(new URL("http://boards.420chan.org/ana/res/75984.php")); | ||
contentURLs.add(new URL("http://7chan.org/gif/res/23795.html")); | ||
contentURLs.add(new URL("http://unichan2.org/b/res/518004.html")); | ||
contentURLs.add(new URL("http://xchan.pw/porn/res/437.html")); | ||
for (URL url : contentURLs) { | ||
try { | ||
ChanRipper ripper = new ChanRipper(url); | ||
ripper.rip(); | ||
assert(ripper.getWorkingDir().listFiles().length > 1); | ||
deleteDir(ripper.getWorkingDir()); | ||
} catch (Exception e) { | ||
e.printStackTrace(); | ||
fail("Error while ripping URL " + url + ": " + e.getMessage()); | ||
} | ||
} | ||
} | ||
|
||
} |