Skip to content

Commit

Permalink
Update crawlers/tests
Browse files Browse the repository at this point in the history
  • Loading branch information
moskrc committed Aug 30, 2022
1 parent 011ad1c commit eb2f95e
Show file tree
Hide file tree
Showing 13 changed files with 1,552 additions and 1,536 deletions.
5 changes: 2 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
## About CrawlerDetect

**CrawlerDetect** is a Python version of the PHP class [CrawlerDetect](https://github.com/JayBizzle/Crawler-Detect).

It helps to detect bots/crawlers/spiders via the user agent and other HTTP-headers. Currently able to detect 1,000's of bots/spiders/crawlers.
This is a Python wrapper for [CrawlerDetect](https://github.com/JayBizzle/Crawler-Detect), the web crawler detection library.
It helps to detect bots/crawlers/spiders via the user agent and other HTTP headers. It is currently able to detect more than 1,000 bots/spiders/crawlers.

### Installation
Run `pip install crawlerdetect`
Expand Down
5 changes: 1 addition & 4 deletions crawlerdetect/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,4 @@
from .src import providers
from .src.crawlerdetect import CrawlerDetect

__all__ = (
'CrawlerDetect',
'providers'
)
__all__ = ("CrawlerDetect", "providers")
11 changes: 5 additions & 6 deletions crawlerdetect/__main__.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,20 @@
import configparser
import os
import sys

import configparser


def get_crawlerdetect_version(config_file_path=None):
    """Return the package version string declared in ``setup.cfg``.

    Reads the ``version`` option from the ``[crawlerdetect]`` section.

    Args:
        config_file_path: Optional explicit path to the config file to read.
            When omitted, defaults to the ``setup.cfg`` located one directory
            above this module's package directory (the original behavior).

    Returns:
        The version string, e.g. ``"0.1.5"``.

    Raises:
        KeyError: if the ``[crawlerdetect]`` section or ``version`` option is
            absent — note ``ConfigParser.read`` silently skips files it
            cannot open, so a missing file surfaces here as a ``KeyError``.
    """
    config = configparser.ConfigParser()

    if config_file_path is None:
        # setup.cfg lives in the project root, one level above the package.
        current_directory = os.path.dirname(os.path.abspath(__file__))
        parent_directory = os.path.abspath(os.path.join(current_directory, os.pardir))
        config_file_path = os.path.join(parent_directory, "setup.cfg")

    config.read(config_file_path)

    return config["crawlerdetect"]["version"]


if __name__ == "__main__":
    if "--version" in sys.argv:
        print(get_crawlerdetect_version())
17 changes: 7 additions & 10 deletions crawlerdetect/src/crawlerdetect.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
import re

from .providers import Exclusions
from .providers import Crawlers
from .providers import Headers
from .providers import Crawlers, Exclusions, Headers


class CrawlerDetect(object):

def __init__(self, headers=None, user_agent=''):
def __init__(self, headers=None, user_agent=""):
self.crawlers = Crawlers()
self.exclusions = Exclusions()
self.uaHttpHeaders = Headers()
Expand All @@ -24,16 +21,16 @@ def setHttpHeaders(self, http_headers):

if http_headers:
for k, v in http_headers.items():
if k.find('HTTP_') == 0:
if k.find("HTTP_") == 0:
self.httpHeaders[k] = v

def setUserAgent(self, user_agent=None):
if not user_agent:
ua = ''
ua = ""

for altHeader in self.getUaHttpHeaders():
if altHeader in self.httpHeaders:
ua += self.httpHeaders[altHeader] + ' '
ua += self.httpHeaders[altHeader] + " "

self.user_agent = ua
else:
Expand All @@ -49,7 +46,7 @@ def compileRegex(self, patterns):
"""
Combine regexps
"""
return '({})'.format('|'.join(patterns))
return "({})".format("|".join(patterns))

def isCrawler(self, user_agent=None):
if not user_agent:
Expand All @@ -58,7 +55,7 @@ def isCrawler(self, user_agent=None):
else:
return False

agent = re.sub(self.compiledExclusions, '', user_agent, flags=re.IGNORECASE)
agent = re.sub(self.compiledExclusions, "", user_agent, flags=re.IGNORECASE)

if not agent:
return False
Expand Down
7 changes: 1 addition & 6 deletions crawlerdetect/src/providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,4 @@
from .exclusions import Exclusions
from .headers import Headers


__all__ = (
'Crawlers',
'Exclusions',
'Headers'
)
__all__ = ("Crawlers", "Exclusions", "Headers")
Loading

0 comments on commit eb2f95e

Please sign in to comment.