diff --git a/release/doc/constant-values.html b/release/doc/constant-values.html index 3705f66..7cf88a4 100644 --- a/release/doc/constant-values.html +++ b/release/doc/constant-values.html @@ -86,7 +86,7 @@

net.pieroxy.*

public static final java.lang.String VERSION -"2015.04.12.08.04.GMT" +"2015.04.20.21.55.GMT" diff --git a/release/doc/index-all.html b/release/doc/index-all.html index 7cc3824..1cf6e56 100644 --- a/release/doc/index-all.html +++ b/release/doc/index-all.html @@ -499,7 +499,7 @@

U

 
UserAgentDetector - Class in net.pieroxy.ua.detection
-
This is the documentation for the version 2015.04.12.08.04.GMT of the library.
+
This is the documentation for the version 2015.04.20.21.55.GMT of the library.
UserAgentDetector() - Constructor for class net.pieroxy.ua.detection.UserAgentDetector
 
diff --git a/release/doc/net/pieroxy/ua/detection/BotFamily.html b/release/doc/net/pieroxy/ua/detection/BotFamily.html index 0a6eb10..3e973c4 100644 --- a/release/doc/net/pieroxy/ua/detection/BotFamily.html +++ b/release/doc/net/pieroxy/ua/detection/BotFamily.html @@ -134,16 +134,21 @@

Enum Constant Summary

+HIDDEN_BOT +
A robot that hides as a regular browser.
+ + + ROBOT
A generic robot.
- + SPAMBOT
A spam bot.
- + UNKNOWN
Unknown type of bot
@@ -233,6 +238,16 @@

SPAMBOT

A spam bot.
+ + + + diff --git a/release/doc/net/pieroxy/ua/detection/Brand.html b/release/doc/net/pieroxy/ua/detection/Brand.html index 94cbf78..b1c2a48 100644 --- a/release/doc/net/pieroxy/ua/detection/Brand.html +++ b/release/doc/net/pieroxy/ua/detection/Brand.html @@ -142,29 +142,35 @@

Enum Constant Summary

AMAZON  -APPLE  +APACHE  -ARCHOS  +APPLE  -ASK  +ARCHOS  -ASUS  +ASK  -AVANT  +ASUS  -BAIDU  +AVANT  -BANANAFISH  +BAIDU  +BANANAFISH  + + BE  + +BOULANGER  + CHROMIUM  @@ -415,48 +421,51 @@

Enum Constant Summary

SONY  -SUN  +SRWARE  -TENCENT  +SUN  -TOSHIBA  +TENCENT  -UNIXLIKE  +TOSHIBA  -UNKNOWN  +UNIXLIKE  -UNKNOWN_ANDROID  +UNKNOWN  -UTSTARCOM  +UNKNOWN_ANDROID  -VIVALDI  +UTSTARCOM  -WEBIN  +VIVALDI  -WIKO  +WEBIN  -WINDOWS  +WIKO  -YACI  +WINDOWS  -YAHOO  +YACI  -YANDEX  +YAHOO  +YANDEX  + + ZTE  @@ -690,6 +699,15 @@

ILEGEND

public static final Brand ILEGEND
+ + + + @@ -1158,6 +1176,15 @@

KYOCERA

public static final Brand KYOCERA
+ + + + @@ -1311,6 +1338,15 @@

OPENSOURCE

public static final Brand OPENSOURCE
+ + + + diff --git a/release/doc/net/pieroxy/ua/detection/UserAgentDetector.html b/release/doc/net/pieroxy/ua/detection/UserAgentDetector.html index 3efa24f..f54adfc 100644 --- a/release/doc/net/pieroxy/ua/detection/UserAgentDetector.html +++ b/release/doc/net/pieroxy/ua/detection/UserAgentDetector.html @@ -101,7 +101,7 @@

Class UserAgentDetector

public class UserAgentDetector
 extends java.lang.Object
 implements IUserAgentDetector
-
This is the documentation for the version 2015.04.12.08.04.GMT of the library.
+
This is the documentation for the version 2015.04.20.21.55.GMT of the library.
diff --git a/release/doc/net/pieroxy/ua/detection/package-summary.html b/release/doc/net/pieroxy/ua/detection/package-summary.html index 9430a63..f5708b1 100644 --- a/release/doc/net/pieroxy/ua/detection/package-summary.html +++ b/release/doc/net/pieroxy/ua/detection/package-summary.html @@ -135,7 +135,7 @@

Package net.pieroxy.ua.detection

UserAgentDetector -
This is the documentation for the version 2015.04.12.08.04.GMT of the library.
+
This is the documentation for the version 2015.04.20.21.55.GMT of the library.
diff --git a/release/user-agent-detector.jar b/release/user-agent-detector.jar index b8db52a..1b796e2 100644 Binary files a/release/user-agent-detector.jar and b/release/user-agent-detector.jar differ diff --git a/src/net/pieroxy/ua/detection/BotFamily.java b/src/net/pieroxy/ua/detection/BotFamily.java index 0c1f6a9..18a6d77 100644 --- a/src/net/pieroxy/ua/detection/BotFamily.java +++ b/src/net/pieroxy/ua/detection/BotFamily.java @@ -13,6 +13,10 @@ public enum BotFamily { /** */ SPAMBOT("Spam bot",true ), /** + * A robot that hides as a regular browser. These are considered to be nefarious (because they hide). + */ + HIDDEN_BOT("Hidden bot",true ), + /** * A robot used to crawl the web (Google's crawler, Bing's crawler, etc.) */ CRAWLER("Web Crawler",false ), diff --git a/src/net/pieroxy/ua/detection/Brand.java b/src/net/pieroxy/ua/detection/Brand.java index c9ae762..35b5f36 100644 --- a/src/net/pieroxy/ua/detection/Brand.java +++ b/src/net/pieroxy/ua/detection/Brand.java @@ -22,6 +22,7 @@ public enum Brand { HAOSOU("Haosou", "http://www.haosou.com"), TENCENT("Tencent Holdings Limited", "http://www.tencent.com/en-us/index.shtml"), CLOUDMOSA("CloudMosa Inc", "http://www.cloudmosa.com/contact"), ILEGEND("iLegendSoft, Inc.", "http://www.ilegendsoft.com"), + BOULANGER("EssentielB", "http://www.essentielb.fr/"), LINKEDIN("LinkedIn", "http://www.linkedin.com"), BANANAFISH("Bananafish Software", "http://bananafishsoftware.com"), WEBIN("Webin", "http://webinhq.com"), @@ -74,6 +75,7 @@ public enum Brand { HAOSOU("Haosou", "http://www.haosou.com"), DELL("Dell", "http://www.dell.com"), COMPAQ("Compaq", "http://www.compaq.com"), KYOCERA("Kyocera", "http://global.kyocera.com"), + SRWARE("SRWare", "http://www.srware.net/en"), ALCATEL("Alcatel", "http://www.alcatelonetouch.com/global-en"), FUJITSU("Fujitsu", "http://www.fujitsu.com"), LOGICOM("Logicom", "http://www.logicom-europe.com"), @@ -91,6 +93,7 @@ public enum Brand { HAOSOU("Haosou", "http://www.haosou.com"), 
LUNASCAPE("Lunascape", "http://www.lunascape.tv"), CHROMIUM("The Chromium Project", "http://www.chromium.org"), OPENSOURCE("An Open Source Project"), + APACHE("Apache Software Foundation", "https://www.apache.org"), VIVALDI("Vivaldi Technologies", "https://vivaldi.com"), KDE("KDE", "http://www.kde.org"), ACCESSCO("Access Co. Ltd.", "http://www.access-company.com"), diff --git a/src/net/pieroxy/ua/detection/UserAgentDetector.java b/src/net/pieroxy/ua/detection/UserAgentDetector.java index ca0e7b3..cab7872 100644 --- a/src/net/pieroxy/ua/detection/UserAgentDetector.java +++ b/src/net/pieroxy/ua/detection/UserAgentDetector.java @@ -100,12 +100,41 @@ static Bot getBot(UserAgentContext context) { String ver; String[]multi; - if (context.ignore("ONDOWN3.2", MatchingType.EQUALS, MatchingRegion.PARENTHESIS)) { // Looks like a bot to me. + if (context.getUA().equals("Mozilla/0.6 Beta (Windows)") || context.getUA().equals("Mozilla/0.91 Beta (Windows)")) { + context.consumeAllTokens(); + return new Bot(Brand.UNKNOWN,BotFamily.HIDDEN_BOT,"",""); + } else if (context.consume("ONDOWN3.2", MatchingType.EQUALS, MatchingRegion.PARENTHESIS)) { // Looks like a bot to me. 
return new Bot(Brand.UNKNOWN,BotFamily.ROBOT,"ONDOWN","3.2"); } else if (context.consume("Google Web Preview", MatchingType.EQUALS, MatchingRegion.PARENTHESIS)) { context.consume("generic", MatchingType.EQUALS, MatchingRegion.PARENTHESIS); context.consume("iPhone", MatchingType.EQUALS, MatchingRegion.PARENTHESIS); return new Bot(Brand.GOOGLE, BotFamily.ROBOT,"Web Preview",""); + } else if (context.consume("Contact: backend@getprismatic.com", MatchingType.EQUALS, MatchingRegion.PARENTHESIS) || + (multi = context.getcNextTokens(new Matcher[] {new Matcher("Contact:", MatchingType.EQUALS), + new Matcher("feedback@getprismatic.com", MatchingType.EQUALS) + }, + MatchingRegion.REGULAR)) != null) { + return new Bot(Brand.OTHER, BotFamily.ROBOT,"Get Prismatic Bot","", "http://getprismatic.com/"); + } + else if ((ver=context.getcVersionAfterPattern("Diffbot/", MatchingType.BEGINS,MatchingRegion.BOTH))!=null || + (ver=context.getcVersionAfterPattern("diffbot/", MatchingType.BEGINS,MatchingRegion.REGULAR))!=null || + context.contains("+http://www.diffbot.com", MatchingType.BEGINS,MatchingRegion.PARENTHESIS)) { + return new Bot(Brand.OTHER, BotFamily.ROBOT,"Diffbot ", ver==null?"":ver, consumeUrlAndMozilla(context, "http://")); + } else if ((ver=context.getcVersionAfterPattern("oBot/", MatchingType.BEGINS,MatchingRegion.BOTH))!=null) { + return new Bot(Brand.IBM, BotFamily.ROBOT,"oBot ", ver, consumeUrlAndMozilla(context, "http://")); + } else if ((ver=context.getcVersionAfterPattern("yoozBot-", MatchingType.BEGINS,MatchingRegion.PARENTHESIS))!=null) { + context.consume("[0-9a-zA-Z\\.]+@[0-9a-zA-Z\\.]+", MatchingType.REGEXP, MatchingRegion.PARENTHESIS); + return new Bot(Brand.OTHER, BotFamily.CRAWLER,"Yooz Bot ", ver, consumeUrlAndMozilla(context, "http://")); + } else if ((ver=context.getcVersionAfterPattern("GWPImages/", MatchingType.BEGINS,MatchingRegion.PARENTHESIS))!=null) { + return new Bot(Brand.OTHER, BotFamily.ROBOT,"GWPImages ", ver, consumeUrlAndMozilla(context, 
"http://")); + } else if ((ver=context.getcVersionAfterPattern("BLEXBot/", MatchingType.BEGINS,MatchingRegion.PARENTHESIS))!=null) { + return new Bot(Brand.OTHER, BotFamily.ROBOT,"BLEX Bot ", ver, consumeUrlAndMozilla(context, "http://")); + } else if ((ver=context.getcVersionAfterPattern("LSSRocketCrawler/", MatchingType.BEGINS,MatchingRegion.REGULAR))!=null) { + context.consume("LightspeedSystems", MatchingType.EQUALS, MatchingRegion.REGULAR); + return new Bot(Brand.OTHER, BotFamily.ROBOT,"LSSRocketCrawler ", ver); + } else if ((ver=context.getcVersionAfterPattern("OrangeBot/", MatchingType.BEGINS,MatchingRegion.PARENTHESIS))!=null) { + context.consume("[0-9a-zA-Z\\.]+@[0-9a-zA-Z\\.]+", MatchingType.REGEXP, MatchingRegion.PARENTHESIS); + return new Bot(Brand.ORANGE, BotFamily.CRAWLER,"Orange Bot ", ver, consumeUrlAndMozilla(context, "http://")); } else if ((ver=context.getcVersionAfterPattern("del.icio.us-thumbnails/", MatchingType.BEGINS,MatchingRegion.BOTH))!=null) { return new Bot(Brand.DELICIOUS, BotFamily.ROBOT,"Thumbnails crawler ", ver); } else if ((ver=context.getcVersionAfterPattern("EvoHtmlToPdf/", MatchingType.BEGINS,MatchingRegion.REGULAR))!=null) { @@ -116,6 +145,11 @@ static Bot getBot(UserAgentContext context) { } context.consume("Unknown", MatchingType.EQUALS,MatchingRegion.PARENTHESIS); return new Bot(Brand.OPENSOURCE,BotFamily.ROBOT,"PhantomJS", ver); + } else if (context.consume("theoldreader.com", MatchingType.EQUALS, MatchingRegion.PARENTHESIS)) { + context.consume("feed-id=", MatchingType.BEGINS, MatchingRegion.PARENTHESIS); + context.consume("[0-9]+ subscribers", MatchingType.REGEXP, MatchingRegion.PARENTHESIS); + consumeUrlAndMozilla(context, "http://"); + return new Bot(Brand.GOOGLE,BotFamily.FEED_CRAWLER,"RSS Feed Fetcher","","http://theoldreader.com/"); } else if (context.consume("Feedfetcher-Google;", MatchingType.EQUALS, MatchingRegion.REGULAR)) { context.consume("feed-id=", MatchingType.BEGINS, MatchingRegion.PARENTHESIS); 
context.consume("[0-9]+ subscribers", MatchingType.REGEXP, MatchingRegion.PARENTHESIS); @@ -480,8 +514,7 @@ else if (context.consume("spbot/",MatchingType.BEGINS, MatchingRegion.PARENTHESI } return new Bot(Brand.OTHER, BotFamily.ROBOT, "360 Spider", ""); } else if ((ver=context.getcVersionAfterPattern("FlipboardProxy/",MatchingType.BEGINS, MatchingRegion.PARENTHESIS))!=null) { - context.consume("+http://flipboard.com/", MatchingType.BEGINS, MatchingRegion.PARENTHESIS); - return new Bot(Brand.OTHER, BotFamily.ROBOT, "Flipboard Proxy", ver); + return new Bot(Brand.OTHER, BotFamily.ROBOT, "Flipboard Proxy", ver, consumeUrlAndMozilla(context,"http://")); } else if (context.consume("Exabot/",MatchingType.BEGINS, MatchingRegion.BOTH) || context.consume("Exabot-Images/",MatchingType.BEGINS, MatchingRegion.BOTH) || context.consume("Exabot-Test/",MatchingType.BEGINS, MatchingRegion.BOTH)) { context.consume("BiggerBetter", MatchingType.BEGINS, MatchingRegion.PARENTHESIS); return new Bot(Brand.EXALEAD, BotFamily.CRAWLER, "Exalead crawler", "", consumeUrlAndMozilla(context,"http://")); @@ -1300,28 +1333,32 @@ static Browser tryOpera(UserAgentContext context) { } static float tryParseVersionNumber(String s) { - StringBuilder sb = new StringBuilder(20); - int status = 0; - for (int i=0 ; i0) + ver = ver.substring(ver.indexOf(" ruby ")+6); + String rver = ver; + + res.browser.family = BrowserFamily.LIBRARY; + res.browser.vendor = Brand.OPENSOURCE; + res.browser.description = "Ruby"; + res.operatingSystem = new OS(Brand.UNKNOWN,OSFamily.UNKNOWN,"",""); + + if ((ver=context.getcVersionAfterPattern("Mechanize/",MatchingType.BEGINS, MatchingRegion.REGULAR)) != null) { + context.consume("http://",MatchingType.BEGINS, MatchingRegion.PARENTHESIS); + res.browser.description = "Mechanize (Ruby)"; + } else if ((ver=context.getcVersionAfterPattern("HTTPClient/",MatchingType.BEGINS, MatchingRegion.REGULAR)) != null) { + context.consume("[0-9]{4}-[0-9]{2}-[0-9]{2}",MatchingType.REGEXP, 
MatchingRegion.PARENTHESIS); + res.browser.description = "HTTPClient (Ruby"+(rver!=null?" "+rver:"")+")"; + } else if (context.consume("Atig::Http/",MatchingType.BEGINS, MatchingRegion.REGULAR)) { + if (context.consume("arm-linux.*", MatchingType.REGEXP, MatchingRegion.PARENTHESIS)) { + res.device.architecture = "arm"; + res.operatingSystem.family = OSFamily.LINUX; + res.operatingSystem.description = "Linux"; + } else if (context.consume("i386-linux.*", MatchingType.REGEXP, MatchingRegion.PARENTHESIS)) { + res.device.architecture = "i386"; + res.operatingSystem.family = OSFamily.LINUX; + res.operatingSystem.description = "Linux"; + } else if (context.consume("i686-linux.*", MatchingType.REGEXP, MatchingRegion.PARENTHESIS)) { + res.device.architecture = "i686"; + res.operatingSystem.family = OSFamily.LINUX; + res.operatingSystem.description = "Linux"; + } else if (context.consume("x86_64-linux.*", MatchingType.REGEXP, MatchingRegion.PARENTHESIS)) { + res.device.architecture = "x86_64"; + res.operatingSystem.family = OSFamily.LINUX; + res.operatingSystem.description = "Linux"; + } + res.browser.description = "Atig (Ruby)"; + + context.consume("http.rb", MatchingType.EQUALS, MatchingRegion.PARENTHESIS); + context.consume("net-irc", MatchingType.EQUALS, MatchingRegion.PARENTHESIS); + } + if (ver != null) res.browser.setFullVersionOneShot(ver); + else if (rver != null) res.browser.setFullVersionOneShot(rver); + + return res; + } else if ((ver=context.getcVersionAfterPattern("Commons-HttpClient/",MatchingType.BEGINS, MatchingRegion.REGULAR)) != null || + (ver=context.getcVersionAfterPattern("Apache-HttpClient/",MatchingType.BEGINS, MatchingRegion.REGULAR)) != null) { + context.consume("Jakarta",MatchingType.EQUALS, MatchingRegion.REGULAR); + context.consume("java ",MatchingType.BEGINS, MatchingRegion.PARENTHESIS); + res.browser.family = BrowserFamily.LIBRARY; + res.browser.vendor = Brand.APACHE; + res.browser.description = "Commons HttpClient"; + 
res.browser.setFullVersionOneShot(ver); + res.operatingSystem = new OS(Brand.UNKNOWN,OSFamily.UNKNOWN,"",""); return res; } else if ((ver=context.getcVersionAfterPattern("Wget/",MatchingType.BEGINS, MatchingRegion.REGULAR)) != null) { res.browser.family = BrowserFamily.LIBRARY; @@ -3855,6 +3955,7 @@ static void consumeRandomGarbage(UserAgentContext context, UserAgentDetectionRes while (context.ignore("\\[xSP_2:[0-9a-f]+_[0-9]+\\]", MatchingType.REGEXP, MatchingRegion.PARENTHESIS)); context.ignore("[0-9]+", MatchingType.REGEXP, MatchingRegion.PARENTHESIS); } + context.ignore("APCPMS=", MatchingType.BEGINS, MatchingRegion.PARENTHESIS); // ? context.ignore("BO[0-9]?IE[89](_v[0-9]+)?", MatchingType.REGEXP, MatchingRegion.PARENTHESIS); // Bing optimized bullshit context.ignore("msn OptimizedIE8", MatchingType.REGEXP, MatchingRegion.PARENTHESIS); // Bing optimized bullshit context.ignore("Tucows", MatchingType.EQUALS, MatchingRegion.PARENTHESIS); // Dunno diff --git a/test-data/database.gz b/test-data/database.gz index 2154467..a68fb6c 100644 Binary files a/test-data/database.gz and b/test-data/database.gz differ