feat(plugin) bot-detection (#1413)
Tieske authored and subnetmarco committed Jul 27, 2016
1 parent 56689a2 commit 8c18b48
Showing 12 changed files with 513 additions and 3 deletions.
8 changes: 7 additions & 1 deletion kong-0.8.3-0.rockspec
@@ -256,6 +256,12 @@ build = {

["kong.plugins.statsd.handler"] = "kong/plugins/statsd/handler.lua",
["kong.plugins.statsd.schema"] = "kong/plugins/statsd/schema.lua",
["kong.plugins.statsd.statsd_logger"] = "kong/plugins/statsd/statsd_logger.lua"
["kong.plugins.statsd.statsd_logger"] = "kong/plugins/statsd/statsd_logger.lua",

["kong.plugins.bot-detection.handler"] = "kong/plugins/bot-detection/handler.lua",
["kong.plugins.bot-detection.schema"] = "kong/plugins/bot-detection/schema.lua",
["kong.plugins.bot-detection.rules"] = "kong/plugins/bot-detection/rules.lua",
["kong.plugins.bot-detection.cache"] = "kong/plugins/bot-detection/cache.lua",
["kong.plugins.bot-detection.hooks"] = "kong/plugins/bot-detection/hooks.lua",
}
}
2 changes: 1 addition & 1 deletion kong/constants.lua
@@ -3,7 +3,7 @@ local plugins = {
"file-log", "http-log", "key-auth", "hmac-auth", "basic-auth", "ip-restriction",
"galileo", "request-transformer", "response-transformer",
"request-size-limiting", "rate-limiting", "response-ratelimiting", "syslog",
"loggly", "datadog", "runscope", "ldap-auth", "statsd"
"loggly", "datadog", "runscope", "ldap-auth", "statsd", "bot-detection"
}

local plugin_map = {}
27 changes: 27 additions & 0 deletions kong/plugins/bot-detection/cache.lua
@@ -0,0 +1,27 @@
local cache = require "kong.tools.database_cache"

local _M = {}

local INDEX = "bot_detection_index"

function _M.set(key, value)
cache.set(cache.bot_detection_key(key), value)
local index_keys = cache.get(INDEX)
if not index_keys then index_keys = {} end
index_keys[#index_keys+1] = key
cache.set(INDEX, index_keys)
end

function _M.get(key)
return cache.get(cache.bot_detection_key(key))
end

function _M.reset()
local index_keys = cache.get(INDEX) or {} -- guard: the index may not exist yet
for _, key in ipairs(index_keys) do
cache.delete(cache.bot_detection_key(key))
end
cache.delete(INDEX)
end

return _M
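
A hedged usage sketch of the contract above (the keys are illustrative; assumes a Kong/OpenResty runtime where kong.tools.database_cache is loadable):

local bot_cache = require "kong.plugins.bot-detection.cache"

bot_cache.set("api-1:curl/7.43.0", true)     -- cache an "allowed" verdict
bot_cache.set("api-1:Googlebot/2.1", false)  -- cache a "blocked" verdict
print(bot_cache.get("api-1:curl/7.43.0"))    -- true

-- reset() walks the stored index and deletes every cached verdict,
-- which is exactly what hooks.lua triggers on plugin updates
bot_cache.reset()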
79 changes: 79 additions & 0 deletions kong/plugins/bot-detection/handler.lua
@@ -0,0 +1,79 @@
local BasePlugin = require "kong.plugins.base_plugin"
local responses = require "kong.tools.responses"
local rules = require "kong.plugins.bot-detection.rules"
local bot_cache = require "kong.plugins.bot-detection.cache"
local strip = require("kong.tools.utils").strip

local ipairs = ipairs
local get_headers = ngx.req.get_headers
local re_match = ngx.re.match

local BotDetectionHandler = BasePlugin:extend()

BotDetectionHandler.PRIORITY = 2500

local function get_user_agent()
local user_agent = get_headers()["user-agent"]
if type(user_agent) == "table" then
return nil, "Only one User-Agent header allowed"
end
return user_agent
end

function BotDetectionHandler:new()
BotDetectionHandler.super.new(self, "bot-detection")
end

function BotDetectionHandler:access(conf)
BotDetectionHandler.super.access(self)

local user_agent, err = get_user_agent()
if err then
return responses.send_HTTP_BAD_REQUEST(err)
end

if user_agent then
user_agent = strip(user_agent)

-- Cache key, per API
local cache_key = ngx.ctx.api.id..":"..user_agent

-- The cache already holds a verdict for previously seen user-agents,
-- so we avoid matching the regexes on every request
local cached_match = bot_cache.get(cache_key)
if cached_match then
return
elseif cached_match == false then
return responses.send_HTTP_FORBIDDEN()
end

if conf.whitelist then
for _, rule in ipairs(conf.whitelist) do
if re_match(user_agent, rule) then
bot_cache.set(cache_key, true)
return
end
end
end

if conf.blacklist then
for _, rule in ipairs(conf.blacklist) do
if re_match(user_agent, rule) then
bot_cache.set(cache_key, false)
return responses.send_HTTP_FORBIDDEN()
end
end
end

for _, rule in ipairs(rules.bots) do
if re_match(user_agent, rule) then
bot_cache.set(cache_key, false)
return responses.send_HTTP_FORBIDDEN()
end
end

bot_cache.set(cache_key, true)
end
end

return BotDetectionHandler
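
The access phase resolves a request in a fixed order: cached verdict first, then whitelist, then blacklist, then the bundled rules, defaulting to allow. A minimal sketch of that precedence, distilled into a standalone function (matches_any and is_allowed are illustrative names, not part of the commit):

local rules = require "kong.plugins.bot-detection.rules"
local re_match = ngx.re.match

-- true if the user-agent matches any pattern in the list
local function matches_any(user_agent, patterns)
  for _, pattern in ipairs(patterns or {}) do
    if re_match(user_agent, pattern) then return true end
  end
  return false
end

-- whitelist beats blacklist, blacklist beats the default rules,
-- and an unmatched user-agent is allowed
local function is_allowed(user_agent, conf)
  if matches_any(user_agent, conf.whitelist) then return true end
  if matches_any(user_agent, conf.blacklist) then return false end
  return not matches_any(user_agent, rules.bots)
end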
14 changes: 14 additions & 0 deletions kong/plugins/bot-detection/hooks.lua
@@ -0,0 +1,14 @@
local events = require "kong.core.events"
local bot_cache = require "kong.plugins.bot-detection.cache"

local function invalidate(message_t)
if message_t.collection == "plugins" and message_t.entity.name == "bot-detection" then
bot_cache.reset()
end
end

return {
[events.TYPES.ENTITY_UPDATED] = function(message_t)
invalidate(message_t)
end
}
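
The hook reads only two fields from the event payload; a hedged sketch of a payload that would trigger invalidation (the table shape is inferred from the fields the code inspects):

local events = require "kong.core.events"
local hooks = require "kong.plugins.bot-detection.hooks"

-- simulated ENTITY_UPDATED event: updating any bot-detection plugin row
-- flushes every cached verdict via bot_cache.reset()
hooks[events.TYPES.ENTITY_UPDATED]({
  collection = "plugins",
  entity = { name = "bot-detection" },
})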
18 changes: 18 additions & 0 deletions kong/plugins/bot-detection/rules.lua
@@ -0,0 +1,18 @@
-- List taken from https://github.com/ua-parser/uap-core/blob/master/regexes.yaml

return {
bots = {
[[(Pingdom.com_bot_version_)(\d+)\.(\d+)]], -- Pingdom
[[(facebookexternalhit)/(\d+)\.(\d+)]], -- Facebook
[[Google.*/\+/web/snippet]], -- Google Plus
[[(Twitterbot)/(\d+)\.(\d+)]], -- Twitter
[[/((?:Ant-)?Nutch|[A-z]+[Bb]ot|[A-z]+[Ss]pider|Axtaris|fetchurl|Isara|ShopSalad|Tailsweep)[ \-](\d+)(?:\.(\d+)(?:\.(\d+))?)?]], -- Bots Pattern '/name-0.0'
[[(008|Altresium|Argus|BaiduMobaider|BoardReader|DNSGroup|DataparkSearch|EDI|Goodzer|Grub|INGRID|Infohelfer|LinkedInBot|LOOQ|Nutch|PathDefender|Peew|PostPost|Steeler|Twitterbot|VSE|WebCrunch|WebZIP|Y!J-BR[A-Z]|YahooSeeker|envolk|sproose|wminer)/(\d+)(?:\.(\d+)(?:\.(\d+))?)?]], --Bots Pattern 'name/0.0'
[[(MSIE) (\d+)\.(\d+)([a-z]\d?)?;.* MSIECrawler]], --MSIECrawler
[[(Google-HTTP-Java-Client|Apache-HttpClient|http%20client|Python-urllib|HttpMonitor|TLSProber|WinHTTP|JNLP)(?:[ /](\d+)(?:\.(\d+)(?:\.(\d+))?)?)?]], -- Downloader ...
[[(1470\.net crawler|50\.nu|8bo Crawler Bot|Aboundex|Accoona-[A-z]+-Agent|AdsBot-Google(?:-[a-z]+)?|altavista|AppEngine-Google|archive.*?\.org_bot|archiver|Ask Jeeves|[Bb]ai[Dd]u[Ss]pider(?:-[A-Za-z]+)*|bingbot|BingPreview|blitzbot|BlogBridge|BoardReader(?: [A-Za-z]+)*|boitho.com-dc|BotSeer|\b\w*favicon\w*\b|\bYeti(?:-[a-z]+)?|Catchpoint bot|[Cc]harlotte|Checklinks|clumboot|Comodo HTTP\(S\) Crawler|Comodo-Webinspector-Crawler|ConveraCrawler|CRAWL-E|CrawlConvera|Daumoa(?:-feedfetcher)?|Feed Seeker Bot|findlinks|Flamingo_SearchEngine|FollowSite Bot|furlbot|Genieo|gigabot|GomezAgent|gonzo1|(?:[a-zA-Z]+-)?Googlebot(?:-[a-zA-Z]+)?|Google SketchUp|grub-client|gsa-crawler|heritrix|HiddenMarket|holmes|HooWWWer|htdig|ia_archiver|ICC-Crawler|Icarus6j|ichiro(?:/mobile)?|IconSurf|IlTrovatore(?:-Setaccio)?|InfuzApp|Innovazion Crawler|InternetArchive|IP2[a-z]+Bot|jbot\b|KaloogaBot|Kraken|Kurzor|larbin|LEIA|LesnikBot|Linguee Bot|LinkAider|LinkedInBot|Lite Bot|Llaut|lycos|Mail\.RU_Bot|masidani_bot|Mediapartners-Google|Microsoft .*? Bot|mogimogi|mozDex|MJ12bot|msnbot(?:-media *)?|msrbot|netresearch|Netvibes|NewsGator[^/]*|^NING|Nutch[^/]*|Nymesis|ObjectsSearch|Orbiter|OOZBOT|PagePeeker|PagesInventory|PaxleFramework|Peeplo Screenshot Bot|PlantyNet_WebRobot|Pompos|Read%20Later|Reaper|RedCarpet|Retreiver|Riddler|Rival IQ|scooter|Scrapy|Scrubby|searchsight|seekbot|semanticdiscovery|Simpy|SimplePie|SEOstats|SimpleRSS|SiteCon|Slurp|snappy|Speedy Spider|Squrl Java|TheUsefulbot|ThumbShotsBot|Thumbshots\.ru|TwitterBot|URL2PNG|Vagabondo|VoilaBot|^vortex|Votay bot|^voyager|WASALive.Bot|Web-sniffer|WebThumb|WeSEE:[A-z]+|WhatWeb|WIRE|WordPress|Wotbox|www\.almaden\.ibm\.com|Xenu(?:.s)? Link Sleuth|Xerka [A-z]+Bot|yacy(?:bot)?|Yahoo[a-z]*Seeker|Yahoo! Slurp|Yandex\w+|YodaoBot(?:-[A-z]+)?|YottaaMonitor|Yowedo|^Zao|^Zao-Crawler|ZeBot_www\.ze\.bz|ZooShot|ZyBorg)(?:[ /]v?(\d+)(?:\.(\d+)(?:\.(\d+))?)?)?]], -- Bots
[[(?:\/[A-Za-z0-9\.]+)? *([A-Za-z0-9 \-_\!\[\]:]*(?:[Aa]rchiver|[Ii]ndexer|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]*))/(\d+)(?:\.(\d+)(?:\.(\d+))?)?]], -- Bots General matcher 'name/0.0'
[[(?:\/[A-Za-z0-9\.]+)? *([A-Za-z0-9 _\!\[\]:]*(?:[Aa]rchiver|[Ii]ndexer|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]*)) (\d+)(?:\.(\d+)(?:\.(\d+))?)?]], -- Bots General matcher 'name 0.0'
[[((?:[A-z0-9]+|[A-z\-]+ ?)?(?: the )?(?:[Ss][Pp][Ii][Dd][Ee][Rr]|[Ss]crape|[A-Za-z0-9-]*(?:[^C][^Uu])[Bb]ot|[Cc][Rr][Aa][Ww][Ll])[A-z0-9]*)(?:(?:[ /]| v)(\d+)(?:\.(\d+)(?:\.(\d+))?)?)?]] -- Bots containing spider|scrape|bot(but not CUBOT)|Crawl
}
}
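
To see which default rule a given client trips, the list can be probed directly; a hedged sketch (needs an OpenResty environment such as the resty CLI, since ngx.re is unavailable in plain Lua):

local rules = require "kong.plugins.bot-detection.rules"

local user_agent = "Googlebot/2.1 (+http://www.google.com/bot.html)"
for i, rule in ipairs(rules.bots) do
  if ngx.re.match(user_agent, rule) then
    print("blocked by rule #" .. i)  -- Googlebot matches the large crawler pattern
    break
  end
end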
21 changes: 21 additions & 0 deletions kong/plugins/bot-detection/schema.lua
@@ -0,0 +1,21 @@
local re_match = ngx.re.match

local check_regex = function(value)
if value then
for _, rule in ipairs(value) do
local _, err = re_match("just a string to test", rule)
if err then
return false, "value '"..rule.."' is not a valid regex"
end
end
end
return true
end

return {
no_consumer = true,
fields = {
whitelist = { type = "array", func = check_regex },
blacklist = { type = "array", func = check_regex },
}
}
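
check_regex leans on ngx.re.match returning an error for patterns PCRE cannot compile; a hedged sketch of the failure mode it guards against:

-- an unbalanced parenthesis fails PCRE compilation
local captures, err = ngx.re.match("just a string to test", "(unclosed")
-- captures is nil and err holds the compile error, so the schema
-- rejects the config with: value '(unclosed' is not a valid regex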
7 changes: 6 additions & 1 deletion kong/tools/database_cache.lua
@@ -19,7 +19,8 @@ local CACHE_KEYS = {
AUTOJOIN_RETRIES = "autojoin_retries",
TIMERS = "timers",
ALL_APIS_BY_DIC = "ALL_APIS_BY_DIC",
- LDAP_CREDENTIAL = "ldap_credentials"
+ LDAP_CREDENTIAL = "ldap_credentials",
+ BOT_DETECTION = "bot_detection"
}

local _M = {}
@@ -117,6 +118,10 @@ function _M.ssl_data(api_id)
return CACHE_KEYS.SSL..":"..api_id
end

function _M.bot_detection_key(key)
return CACHE_KEYS.BOT_DETECTION..":"..key
end

function _M.all_apis_by_dict_key()
return CACHE_KEYS.ALL_APIS_BY_DIC
end
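The new helper follows the same prefixing convention as the other CACHE_KEYS accessors; for example (key contents illustrative):

local cache = require "kong.tools.database_cache"

-- produces "bot_detection:<api_id>:<user_agent>", the cache_key
-- format built in handler.lua
print(cache.bot_detection_key("api-1:curl/7.43.0"))
--> bot_detection:api-1:curl/7.43.0
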
149 changes: 149 additions & 0 deletions spec/03-plugins/bot-detection/01-access_spec.lua
@@ -0,0 +1,149 @@
local helpers = require "spec.helpers"

local HELLOWORLD = "HelloWorld" -- just a test value
local FACEBOOK = "facebookexternalhit/1.1" -- matches a known bot in `rules.lua`

describe("Plugin: bot-detection (access)", function()

local client

setup(function()
helpers.prepare_prefix()

local api1 = assert(helpers.dao.apis:insert {
request_host = "bot.com",
upstream_url = "http://mockbin.com"
})
local api2 = assert(helpers.dao.apis:insert {
request_host = "bot2.com",
upstream_url = "http://mockbin.com"
})
local api3 = assert(helpers.dao.apis:insert {
request_host = "bot3.com",
upstream_url = "http://mockbin.com"
})

-- plugin 1
assert(helpers.dao.plugins:insert {
api_id = api1.id,
name = "bot-detection",
config = {},
})
-- plugin 2
assert(helpers.dao.plugins:insert {
api_id = api2.id,
name = "bot-detection",
config = {
blacklist = HELLOWORLD
},
})
-- plugin 3
assert(helpers.dao.plugins:insert {
api_id = api3.id,
name = "bot-detection",
config = {
whitelist = FACEBOOK
},
})

assert(helpers.start_kong())
end)

teardown(function()
helpers.stop_kong()
end)

before_each(function()
client = assert(helpers.proxy_client())
end)

after_each(function()
if client then client:close() end
end)

it("allows regular requests", function()
local res = assert( client:send {
method = "GET",
path = "/request",
headers = { host = "bot.com" }
})
assert.response(res).has.status(200)

local res = assert( client:send {
method = "GET",
path = "/request",
headers = {
host = "bot.com",
["user-agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
}
})
assert.response(res).has.status(200)

local res = assert( client:send {
method = "GET",
path = "/request",
headers = {
host = "bot.com",
["user-agent"] = HELLOWORLD
}
})
assert.response(res).has.status(200)

local res = assert( client:send {
method = "GET",
path = "/request",
headers = {
host = "bot.com",
["user-agent"] = "curl/7.43.0"
}
})
assert.response(res).has.status(200)
end)

it("blocks bots", function()
local res = assert( client:send {
method = "GET",
path = "/request",
headers = {
host = "bot.com",
["user-agent"] = "Googlebot/2.1 (+http://www.google.com/bot.html)"
},
})
assert.response(res).has.status(403)

local res = assert( client:send {
method = "GET",
path = "/request",
headers = {
host = "bot.com",
["user-agent"] = FACEBOOK,
}
})
assert.response(res).has.status(403)
end)

it("blocks blacklisted user-agents", function()
local res = assert( client:send {
method = "GET",
path = "/request",
headers = {
host = "bot2.com",
["user-agent"] = HELLOWORLD,
}
})
assert.response(res).has.status(403)
end)

it("allows whitelisted user-agents", function()
local res = assert( client:send {
method = "GET",
path = "/request",
headers = {
host = "bot3.com",
["user-agent"] = FACEBOOK
}
})
assert.response(res).has.status(200)
end)

end)