From 45cb6db1ab5e5cd6554bed5ded015fc97201c089 Mon Sep 17 00:00:00 2001 From: Bruno Skvorc Date: Wed, 20 May 2015 23:00:30 +0000 Subject: [PATCH] Implemented Crawlbot, this closes #4 and closes #1 --- .scrutinizer.yml | 2 +- .travis.yml | 2 - CHANGELOG.md | 23 + README.md | 105 ++- TODO.md | 13 +- src/Abstracts/Api.php | 63 +- src/Abstracts/Job.php | 338 +++++++++ src/Api/Analyze.php | 4 +- src/Api/Article.php | 2 +- src/Api/Crawl.php | 518 ++++++++++++++ src/Api/Custom.php | 2 +- src/Api/Discussion.php | 2 +- src/Api/Image.php | 2 +- src/Api/Product.php | 2 +- src/Diffbot.php | 22 + src/Entity/EntityIterator.php | 20 +- src/Entity/JobBulk.php | 9 + src/Entity/JobCrawl.php | 51 ++ src/Interfaces/Api.php | 13 + src/Traits/DiffbotAware.php | 26 + tests/Api/AnalyzeApiTest.php | 12 +- tests/Api/ArticleApiTest.php | 12 +- tests/Api/CrawlCustomMocksTest.php | 125 ++++ tests/Api/CrawlTest.php | 378 +++++++++++ tests/Api/CustomApiTest.php | 12 +- tests/Api/DiscussionApiTest.php | 12 +- tests/Api/ImageApiTest.php | 8 +- tests/Api/ProductApiTest.php | 6 +- tests/DiffbotTest.php | 9 + tests/Entity/CrawlJobTest.php | 639 ++++++++++++++++++ .../15-05-18/sitepoint_01_deleted.json | 11 + .../15-05-18/sitepoint_01_maxCrawled.json | 58 ++ .../Crawlbot/15-05-20/deletedSuccess.json | 11 + .../Mocks/Crawlbot/15-05-20/invalid_name.json | 10 + .../Crawlbot/15-05-20/invalid_response.json | 11 + .../Crawlbot/15-05-20/multiplejobs01.json | 96 +++ .../15-05-20/sitepoint_01_paused.json | 56 ++ .../15-05-20/sitepoint_01_restart.json | 58 ++ .../15-05-20/sitepoint_01_roundstart.json | 56 ++ .../15-05-20/sitepoint_01_unpaused.json | 56 ++ 40 files changed, 2767 insertions(+), 88 deletions(-) create mode 100644 src/Abstracts/Job.php create mode 100644 src/Api/Crawl.php create mode 100644 src/Entity/JobBulk.php create mode 100644 src/Entity/JobCrawl.php create mode 100644 src/Traits/DiffbotAware.php create mode 100644 tests/Api/CrawlCustomMocksTest.php create mode 100644 tests/Api/CrawlTest.php 
create mode 100644 tests/Entity/CrawlJobTest.php create mode 100644 tests/Mocks/Crawlbot/15-05-18/sitepoint_01_deleted.json create mode 100644 tests/Mocks/Crawlbot/15-05-18/sitepoint_01_maxCrawled.json create mode 100644 tests/Mocks/Crawlbot/15-05-20/deletedSuccess.json create mode 100644 tests/Mocks/Crawlbot/15-05-20/invalid_name.json create mode 100644 tests/Mocks/Crawlbot/15-05-20/invalid_response.json create mode 100644 tests/Mocks/Crawlbot/15-05-20/multiplejobs01.json create mode 100644 tests/Mocks/Crawlbot/15-05-20/sitepoint_01_paused.json create mode 100644 tests/Mocks/Crawlbot/15-05-20/sitepoint_01_restart.json create mode 100644 tests/Mocks/Crawlbot/15-05-20/sitepoint_01_roundstart.json create mode 100644 tests/Mocks/Crawlbot/15-05-20/sitepoint_01_unpaused.json diff --git a/.scrutinizer.yml b/.scrutinizer.yml index 2f8ae48..af89f30 100644 --- a/.scrutinizer.yml +++ b/.scrutinizer.yml @@ -18,4 +18,4 @@ checks: tools: external_code_coverage: timeout: 600 - runs: 3 \ No newline at end of file + runs: 1 \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index ae59de1..735d3b4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,6 @@ language: php php: - - 5.4 - - 5.5 - 5.6 - 7.0 - hhvm diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b2e3e5..4d65b77 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,29 @@ #Changelog All notable changes will be documented in this file +## 0.3 - May 17th, 2015 + +### Internal changes + +- [Internal] DiffbotAware trait now responsible for registering Diffbot parent in children +- [BC Break, Internal] PHP 5.6 is now required (`...` operator) +- [Internal] Updated all API calls to HTTPS + +### Features + +- [Feature] Implemented Crawlbot API, added usage example to README + - [Feature] Added `Job` abstract entity with `JobCrawl` and `JobBulk` derivations. A `Job` is either a [Bulk API job](https://www.diffbot.com/dev/docs/bulk) or a [Crawl job](https://www.diffbot.com/dev/docs/crawl). 
A collection of jobs is the result of a Crawl or Bulk API call. When job name is provided, a max of one item is present in the collection. + +### Bugs + +- [Bug] Fixed [#1](https://github.com/Swader/diffbot-php-client/issues/1) + +### Meta + +- [Repository] Added TODOs as issues in repo, linked to relevant ones in [TODO file](TODO.md). +- [CI] Stopped testing for 5.4 and 5.5, updated Travis and Scrutinizer file to take this into account +- [Tests] Fully tested Crawlbot implementation + ## 0.2 - May 2nd, 2015 - added Discussion API diff --git a/README.md b/README.md index 78a3849..8a9dc7c 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Right now it only supports Analyze, Product, Image, Discussion and Article APIs, ## Requirements -Minimum PHP 5.4 because Guzzle needs it. +Minimum PHP 5.6 is required. When installed via Composer, the library will pull in Guzzle 5 as well, so it's recommended you have cURL installed, but not required. ## Install @@ -59,7 +59,7 @@ Currently available [*automatic*](http://www.diffbot.com/products/automatic/) AP - [discussion](http://www.diffbot.com/products/automatic/discussion/) (fetches discussion / review / comment threads - can be embedded in the Product or Article return data, too, if those contain any comments or discussions) - [analyze](http://www.diffbot.com/products/automatic/analyze/) (combines all the above in that it automatically determines the right API for the URL and applies it) -Video is coming soon. +Video is coming soon. See below for instructions on Crawlbot, Search and Bulk API. There is also a [Custom API](http://www.diffbot.com/products/custom/) like [this one](http://www.sitepoint.com/analyze-sitepoint-author-portfolios-diffbot/) - unless otherwise configured, they return instances of the Wildcard entity) @@ -200,7 +200,7 @@ Used just like all others. 
There are only two differences: The following is a usage example of my own custom API for author profiles at SitePoint: ```php -$diffbot = new Diffbot('brunoskvorc'); +$diffbot = new Diffbot('my_token'); $customApi = $diffbot->createCustomAPI('http://sitepoint.com/author/bskvorc', 'authorFolioNew'); $return = $customApi->call(); @@ -213,6 +213,105 @@ foreach ($return as $wildcard) { Of course, you can easily extend the basic Custom API class and make your own, as well as add your own Entities that perfectly correspond to the returned data. This will all be covered in a tutorial in the near future. +## Crawlbot and Bulk API + +Basic Crawlbot support has been added to the library. +To find out more about Crawlbot and what, how and why it does what it does, see [here](https://www.diffbot.com/dev/docs/crawl/). +I also recommend reading the [Crawlbot API docs](https://www.diffbot.com/dev/docs/crawl/api.jsp) and the [Crawlbot support topics](http://support.diffbot.com/topics/crawlbot/) just so you can dive right in without being too confused by the code below. + +In a nutshell, the Crawlbot crawls a set of seed URLs for links (even if a subdomain is passed to it as seed URL, it still looks through the entire main domain and all other subdomains it can find) and then processes all the pages it can find using the API you define (or opting for Analyze API by default). + +### List of all crawl / bulk jobs + +A joint list of all your crawl / bulk jobs can be fetched via: + +``` +$diffbot = new Diffbot('my_token'); +$jobs = $diffbot->crawl()->call(); +``` + +This returns a collection of all crawl and bulk jobs. Each type is represented by its own class: `JobCrawl` and `JobBulk`. It's important to note that Jobs only contain the information about the job - not the data. 
To get the data of a job, use the `getDownloadUrl` method to get the URL to the dataset: + +``` +$url = $job->getDownloadUrl("json"); +``` + +### Crawl jobs: Creating a Crawl Job + +See the inline comments for a step-by-step explanation. + +``` +// Create new diffbot as usual +$diffbot = new Diffbot('my_token'); + +// The crawlbot needs to be told which API to use to process crawled pages. This is optional - if omitted, it will be told to use the Analyze API with mode set to auto. +// The "crawl" url is a flag to tell APIs to prepare for consumption with Crawlbot, letting them know they won't be used directly. +$url = 'crawl'; +$articleApi = $diffbot->createArticleAPI($url)->setDiscussion(false); + +// Make a new crawl job. Optionally, pass in API instance +$crawl = $diffbot->crawl('sitepoint_01', $articleApi); + +// Set seeds - seeds are URLs to crawl. By default, passing a subdomain into the crawl will also crawl other subdomains on main domain, including www. +$crawl->setSeeds(['http://sitepoint.com']); + +// Call as usual - an EntityIterator collection of results is returned. When a job is created, exactly one job entity is returned. +$job = $crawl->call(); + +// See JobCrawl class to find out which getters are available +dump($job->getDownloadUrl("json")); // outputs download URL to JSON dataset of the job's result +``` + +### Crawl jobs: Inspecting an existing Crawl Job + +To get data about a job (this will be the data it was configured with - its flags - and not the results!), use the exact same approach as if creating a new one, only without the API and seeds: + +``` +$diffbot = new Diffbot('my_token'); + +$crawl = $diffbot->crawl('sitepoint_01'); + +$job = $crawl->call(); + +dump($job->getDownloadUrl("json")); // outputs download URL to JSON dataset of the job's result +``` + +### Crawl jobs: Modifying an existing Crawl Job + +While there is no way to alter a crawl job's configuration post creation, you can still do some operations on it. 
+ +Provided you fetched a `$crawl` instance as in the above section on inspecting, you can do the following: + +``` +// Force start of a new crawl round manually +$crawl->roundStart(); + +// Pause or unpause a job +$crawl->pause(); +$crawl->unpause(); + +// Restart removes all crawled data but keeps the job (and settings) +$crawl->restart(); + +// Delete a job and all related data +$crawl->delete(); +``` + +Note that it is not necessary to issue a `call()` after these methods. + +If you would like to extract the generated API call URL for these instant-call actions, pass in the parameter `false`, like so: + +``` +$crawl->delete(false); +``` + +You can then save the URL for your convenience and call `call()` on the crawl instance when ready to execute (if at all). + +``` +$url = $crawl->buildUrl(); +$crawl->call(); +``` + ## Testing Just run PHPUnit in the root folder of the cloned project. diff --git a/TODO.md b/TODO.md index b2d410c..435aa75 100644 --- a/TODO.md +++ b/TODO.md @@ -4,16 +4,21 @@ Active todos, ordered by priority ## High -- implement Crawlbot -- implement Search API +- [implement Bulk Processing Support](https://github.com/Swader/diffbot-php-client/issues/3) +- [implement Search API](https://github.com/Swader/diffbot-php-client/issues/2) ## Medium -- add streaming to Crawlbot - make it stream the result (it constantly grows) -- implement Video API (currently beta) +- [add streaming to Crawlbot](https://github.com/Swader/diffbot-php-client/issues/5) +- [implement Video API](https://github.com/Swader/diffbot-php-client/issues/6) (currently beta) +- [implement Webhook](https://github.com/Swader/diffbot-php-client/issues/7) for Bulk / Crawlbot completion +- look into adding async support via Guzzle +- consider alternative solution to 'crawl' setting in Api abstract ([#8](https://github.com/Swader/diffbot-php-client/issues/8)). 
+- API docs needed ([#9](https://github.com/Swader/diffbot-php-client/issues/3)) ## Low +- see what can be done with the [URL report](https://www.diffbot.com/dev/docs/crawl/) - some implementation options? - add more usage examples - work on PhpDoc consistency ($param type vs type $param) - get more mock responses and test against them diff --git a/src/Abstracts/Api.php b/src/Abstracts/Api.php index 69a8fe1..27b79fa 100644 --- a/src/Abstracts/Api.php +++ b/src/Abstracts/Api.php @@ -3,6 +3,7 @@ namespace Swader\Diffbot\Abstracts; use Swader\Diffbot\Diffbot; +use Swader\Diffbot\Traits\DiffbotAware; /** * Class Api @@ -28,26 +29,29 @@ abstract class Api implements \Swader\Diffbot\Interfaces\Api /** @var Diffbot The parent class which spawned this one */ protected $diffbot; + use DiffbotAware; public function __construct($url) { - $url = trim((string)$url); - if (strlen($url) < 4) { - throw new \InvalidArgumentException( - 'URL must be a string of at least four characters in length' - ); - } - - $url = (isset(parse_url($url)['scheme'])) ? $url : "http://$url"; - - $filtered_url = filter_var($url, FILTER_VALIDATE_URL); - if (!$filtered_url) { - throw new \InvalidArgumentException( - 'You provided an invalid URL: ' . $url - ); + if (strcmp($url, 'crawl') !== 0) { + $url = trim((string)$url); + if (strlen($url) < 4) { + throw new \InvalidArgumentException( + 'URL must be a string of at least four characters in length' + ); + } + + $url = (isset(parse_url($url)['scheme'])) ? $url : "http://$url"; + + $filtered_url = filter_var($url, FILTER_VALIDATE_URL); + if (!$filtered_url) { + throw new \InvalidArgumentException( + 'You provided an invalid URL: ' . $url + ); + } + $url = $filtered_url; } - - $this->url = $filtered_url; + $this->url = $url; } /** @@ -91,14 +95,15 @@ public function call() public function buildUrl() { - $url = rtrim($this->apiUrl, '/'); + $url = rtrim($this->apiUrl, '/').'?'; - // Add Token - $url .= '?token=' . 
$this->diffbot->getToken(); - - // Add URL - $url .= '&url=' . urlencode($this->url); + if (strcmp($url,'crawl') !== 0) { + // Add Token + $url .= 'token=' . $this->diffbot->getToken(); + // Add URL + $url .= '&url=' . urlencode($this->url); + } // Add Custom Fields $fields = $this->fieldSettings; @@ -118,18 +123,4 @@ public function buildUrl() return $url; } - - /** - * Sets the Diffbot instance on the child class - * Used to later fetch the token, HTTP client, EntityFactory, etc - * @param Diffbot $d - * @return $this - */ - public function registerDiffbot(Diffbot $d) - { - $this->diffbot = $d; - - return $this; - } - } diff --git a/src/Abstracts/Job.php b/src/Abstracts/Job.php new file mode 100644 index 0000000..973bf3d --- /dev/null +++ b/src/Abstracts/Job.php @@ -0,0 +1,338 @@ +data['name']; + } + + /** + * Should always return either "crawl" or "bulk" + * @return string + */ + public function getType() + { + return $this->data['type']; + } + + /** + * Timestamp of job creation + * + * @return int + */ + public function getJobCreationTimeUTC() + { + return (isset($this->data['jobCreationTimeUTC'])) + ? (int)$this->data['jobCreationTimeUTC'] : null; + } + + /** + * Timestamp of job completion + * + * @return int + */ + public function getJobCompletionTimeUTC() + { + return (isset($this->data['jobCompletionTimeUTC'])) + ? (int)$this->data['jobCompletionTimeUTC'] : null; + } + + /** + * Possible statuses + * + * 0 Job is initializing + * 1 Job has reached maxRounds limit + * 2 Job has reached maxToCrawl limit + * 3 Job has reached maxToProcess limit + * 4 Next round to start in _____ seconds + * 5 No URLs were added to the crawl + * 6 Job paused + * 7 Job in progress + * 8 All crawling temporarily paused by root administrator for maintenance. + * 9 Job has completed and no repeat is scheduled + * + * @return array + */ + public function getJobStatus() + { + return (isset($this->data['jobStatus'])) + ? 
$this->data['jobStatus'] : []; + } + + /** + * True or false, depending on whether "job complete" notification was sent + * + * @return bool + */ + public function getNotificationSent() + { + return (bool)$this->data['sentJobDoneNotification']; + } + + /** + * Number of objects found + * + * @return int + */ + public function getObjectsFound() + { + return (int)$this->data['objectsFound']; + } + + /** + * Number of URLs harvested + * + * @return int + */ + public function getUrlsHarvested() + { + return (int)$this->data['urlsHarvested']; + } + + /** + * Returns an array with information about crawls - total attempts, + * successes, and successes this round + * + * @return array + */ + public function getPageCrawlInfo() + { + return [ + 'attempts' => $this->data['pageCrawlAttempts'], + 'successes' => $this->data['pageCrawlSuccesses'], + 'successesThisRound' => $this->data['pageCrawlSuccessesThisRound'] + ]; + } + + /** + * Returns an array with information about crawls - total attempts, + * successes, and successes this round + * + * @return array + */ + public function getPageProcessInfo() + { + return [ + 'attempts' => $this->data['pageProcessAttempts'], + 'successes' => $this->data['pageProcessSuccesses'], + 'successesThisRound' => $this->data['pageProcessSuccessesThisRound'] + ]; + } + + /** + * The maximum number of crawl repeats. By default (maxRounds=0) repeating + * crawls will continue indefinitely. + * + * @return int + */ + public function getMaxRounds() + { + return (int)$this->data['maxRounds']; + } + + /** + * The number of days as a floating-point (e.g. repeat=7.0) to repeat this + * crawl. By default crawls will not be repeated. + * + * @return float + */ + public function getRepeat() + { + return (float)$this->data['repeat']; + } + + /** + * Wait this many seconds between each URL crawled from a single IP address. + * Number of seconds as an integer or floating-point number + * (e.g., crawlDelay=0.25). 
+ * + * @return float + */ + public function getCrawlDelay() + { + return (float)$this->data['crawlDelay']; + } + + /** + * Whether or not the job was set to respect robots.txt + * + * @return bool + */ + public function getObeyRobots() + { + return (bool)$this->data['obeyRobots']; + } + + /** + * How many rounds were completed with the job so far + * + * @return int + */ + public function getRoundsCompleted() + { + return (int)$this->data['roundsCompleted']; + } + + /** + * Returns timestamp of when next crawl round is about to start or 0 if none + * + * @return int + */ + public function getRoundStartTime() + { + return (int)$this->data['roundStartTime']; + } + + /** + * Returns timestamp of current time + * + * @return int + */ + public function getCurrentTime() + { + return (int)$this->data['currentTime']; + } + + /** + * Returns timestamp of current time, UTC. + * Should be the same as getCurrentTime + * + * @return int + */ + public function getCurrentTimeUTC() + { + return (int)$this->data['currentTimeUTC']; + } + + /** + * The API URL is the URL of the API used to process pages found in the + * crawl. If the job was created with this Diffbot lib, then it was + * automatically built from a pre-configured API instance + * + * The API URL will be URL decoded, whereas it is submitted encoded. 
+ * + * @return string + */ + public function getApiUrl() + { + return (string)$this->data['apiUrl']; + } + + /** + * @see \Swader\Diffbot\Api\Crawl::setUrlCrawlPattern + * @return string + */ + public function getUrlCrawlPattern() + { + return (string)$this->data['urlCrawlPattern']; + } + + /** + * @see \Swader\Diffbot\Api\Crawl::setUrlProcessPattern + * @return string + */ + public function getUrlProcessPattern() + { + return (string)$this->data['urlProcessPattern']; + } + + /** + * @see \Swader\Diffbot\Api\Crawl::setPageProcessPattern + * @return string + */ + public function getPageProcessPattern() + { + return (string)$this->data['pageProcessPattern']; + } + + /** + * @see \Swader\Diffbot\Api\Crawl::setUrlCrawlRegex + * + * @return string + */ + public function getUrlCrawlRegex() + { + return (string)$this->data['urlCrawlRegEx']; + } + + /** + * @see \Swader\Diffbot\Api\Crawl::setUrlProcessRegex + * + * @return string + */ + public function getUrlProcessRegex() + { + return (string)$this->data['urlProcessRegEx']; + } + + /** + * @see \Swader\Diffbot\Api\Crawl::setMaxHops + * + * @return int + */ + public function getMaxHops() + { + return (int)$this->data['maxHops']; + } + + /** + * Returns the link to the dataset the job produced. + * + * Accepted arguments are: "json", "csv" and "debug". + * It is important to be aware of the difference between the types. + * See "Retrieving Bulk Data" in link. + * + * @see https://www.diffbot.com/dev/docs/crawl/api.jsp + * + * @param string $type + * @return string + * @throws DiffbotException + */ + public function getDownloadUrl($type = "json") + { + switch ($type) { + case "json": + return $this->data['downloadJson']; + case "debug": + return $this->data['downloadUrls']; + case "csv": + return rtrim($this->data['downloadJson'], '.json') . '.csv'; + default: + break; + } + + throw new \InvalidArgumentException( + 'Only json, debug, or csv download link available. You asked for: ' + . 
$type); + } + + /** + * Returns the email that was set to be notified after job's completion + * + * @return string + */ + public function getNotifyEmail() + { + return (string)$this->data['notifyEmail']; + } + + /** + * Returns the webhook that was set to be pinged after job's completion + * + * @return string + */ + public function getNotifyWebhook() + { + return (string)$this->data['notifyWebhook']; + } +} \ No newline at end of file diff --git a/src/Api/Analyze.php b/src/Api/Analyze.php index 22cbe80..1daae2b 100644 --- a/src/Api/Analyze.php +++ b/src/Api/Analyze.php @@ -10,7 +10,7 @@ class Analyze extends Api use StandardApi; /** @var string API URL to which to send the request */ - protected $apiUrl = 'http://api.diffbot.com/v3/analyze'; + protected $apiUrl = 'https://api.diffbot.com/v3/analyze'; /** * If set to false, will not extract article comments in a Discussion @@ -37,7 +37,7 @@ public function setDiscussion($bool = true) */ public function setMode($mode) { - if (!in_array($mode, ['article', 'product', 'image'])) { + if (!in_array($mode, ['article', 'product', 'image', 'auto'])) { $error = 'Only "article", "product" and "image" modes supported.'; throw new \InvalidArgumentException($error); } diff --git a/src/Api/Article.php b/src/Api/Article.php index a325f8d..9edc6ea 100644 --- a/src/Api/Article.php +++ b/src/Api/Article.php @@ -10,7 +10,7 @@ class Article extends Api use StandardApi; /** @var string API URL to which to send the request */ - protected $apiUrl = 'http://api.diffbot.com/v3/article'; + protected $apiUrl = 'https://api.diffbot.com/v3/article'; /** * @see Swader\Diffbot\Entity\Article::getSentiment() diff --git a/src/Api/Crawl.php b/src/Api/Crawl.php new file mode 100644 index 0000000..b846723 --- /dev/null +++ b/src/Api/Crawl.php @@ -0,0 +1,518 @@ +name = $name; + if ($api) { + $this->setApi($api); + } + } + } + + /** + * Returns the unique name of the crawljob + * This name is later used to download datasets, or to modify the job + * + 
* @return string + */ + public function getName() + { + return $this->name; + } + + /** + * API which should be used to process the pages + * + * Accepts a fully formed instance of any other API. Will use it to build + * and auto-encode the URL. To satisfy the required $url param of the API + * classes, use the string 'crawl' which prepares the API for Crawlbot + * consumption internally. + * + * @see https://www.diffbot.com/dev/docs/crawl/api.jsp ApiUrl docs + * @param Api $api + * @return $this + */ + public function setApi(Api $api) + { + $this->api = $api; + + return $this; + } + + /** + * An array of URLs (seeds) which to crawl for matching links + * + * By default Crawlbot will restrict spidering to the entire domain + * ("http://blog.diffbot.com" will include URLs at "http://www.diffbot.com"). + * + * @param array $seeds + * @return $this + */ + public function setSeeds(array $seeds) + { + $invalidSeeds = []; + foreach ($seeds as $seed) { + if (!filter_var($seed, FILTER_VALIDATE_URL)) { + $invalidSeeds[] = $seed; + } + } + if (!empty($invalidSeeds)) { + throw new \InvalidArgumentException( + 'Some seeds were invalid: ' . implode(',', $invalidSeeds) + ); + } + + $this->seeds = $seeds; + + return $this; + } + + /** + * Array of strings to limit pages crawled to those whose URLs + * contain any of the content strings. + * + * You can use the exclamation point to specify a negative string, e.g. + * !product to exclude URLs containing the string "product," and the ^ and + * $ characters to limit matches to the beginning or end of the URL. + * + * The use of a urlCrawlPattern will allow Crawlbot to spider outside of + * the seed domain; it will follow all matching URLs regardless of domain. + * + * @param array $pattern + * @return $this + */ + public function setUrlCrawlPatterns(array $pattern = null) + { + $this->otherOptions['urlCrawlPattern'] = ($pattern === null) ? 
null + : implode("||", array_map(function ($item) { + return urlencode($item); + }, $pattern)); + + return $this; + } + + /** + * Specify a regular expression to limit pages crawled to those URLs that + * match your expression. This will override any urlCrawlPattern value. + * + * The use of a urlCrawlRegEx will allow Crawlbot to spider outside of the + * seed domain; it will follow all matching URLs regardless of domain. + * + * @param $regex + * @return $this + */ + public function setUrlCrawlRegEx($regex) + { + $this->otherOptions['urlCrawlRegEx'] = $regex; + + return $this; + } + + /** + * Specify ||-separated strings to limit pages processed to those whose + * URLs contain any of the content strings. + * + * You can use the exclamation point to specify a negative string, e.g. + * !/category to exclude URLs containing the string "/category," and the ^ + * and $ characters to limit matches to the beginning or end of the URL. + * + * @param array $pattern + * @return $this + */ + public function setUrlProcessPatterns(array $pattern = null) + { + $this->otherOptions['urlProcessPattern'] = ($pattern === null) ? null + : implode("||", array_map(function ($item) { + return urlencode($item); + }, $pattern)); + + return $this; + } + + /** + * Specify a regular expression to limit pages processed to those URLs that + * match your expression. This will override any urlProcessPattern value. + * + * @param $regex + * @return $this + */ + public function setUrlProcessRegEx($regex) + { + $this->otherOptions['urlProcessRegEx'] = $regex; + + return $this; + + } + + /** + * Specify ||-separated strings to limit pages processed to those whose + * HTML contains any of the content strings. 
+ * + * @param array $pattern + * @return $this + */ + public function setPageProcessPatterns(array $pattern) + { + $this->otherOptions['pageProcessPattern'] = implode("||", + array_map(function ($item) { + return urlencode($item); + }, $pattern)); + + return $this; + } + + /** + * Specify the depth of your crawl. A maxHops=0 will limit processing to + * the seed URL(s) only -- no other links will be processed; maxHops=1 will + * process all (otherwise matching) pages whose links appear on seed URL(s); + * maxHops=2 will process pages whose links appear on those pages; and so on + * + * By default, Crawlbot will crawl and process links at any depth. + * + * @param int $input + * @return $this + */ + public function setMaxHops($input = -1) + { + if ((int)$input < -1) { + $input = -1; + } + $this->otherOptions['maxHops'] = (int)$input; + + return $this; + } + + /** + * Specify max pages to spider. Default: 100,000. + * + * @param int $input + * @return $this + */ + public function setMaxToCrawl($input = 100000) + { + if ((int)$input < 1) { + $input = 1; + } + $this->otherOptions['maxToCrawl'] = (int)$input; + + return $this; + } + + /** + * Specify max pages to process through Diffbot APIs. Default: 100,000. + * + * @param int $input + * @return $this + */ + public function setMaxToProcess($input = 100000) + { + if ((int)$input < 1) { + $input = 1; + } + $this->otherOptions['maxToProcess'] = (int)$input; + + return $this; + } + + /** + * If input is email address, end a message to this email address when the + * crawl hits the maxToCrawl or maxToProcess limit, or when the crawl + * completes. + * + * If input is URL, you will receive a POST with X-Crawl-Name and + * X-Crawl-Status in the headers, and the full JSON response in the + * POST body. 
+ * + * @param $string + * @return $this + * @throws InvalidArgumentException + */ + public function notify($string) + { + if (filter_var($string, FILTER_VALIDATE_EMAIL)) { + $this->otherOptions['notifyEmail'] = $string; + + return $this; + } + if (filter_var($string, FILTER_VALIDATE_URL)) { + $this->otherOptions['notifyWebhook'] = urlencode($string); + + return $this; + } + + throw new InvalidArgumentException( + 'Only valid email or URL accepted! You provided: ' . $string + ); + } + + /** + * Wait this many seconds between each URL crawled from a single IP address. + * Specify the number of seconds as an integer or floating-point number. + * + * @param float $input + * @return $this + * @throws InvalidArgumentException + */ + public function setCrawlDelay($input = 0.25) + { + if (!is_numeric($input)) { + throw new InvalidArgumentException('Input must be numeric.'); + } + $input = ($input < 0) ? 0.25 : $input; + $this->otherOptions['crawlDelay'] = (float)$input; + + return $this; + } + + /** + * Specify the number of days as a floating-point (e.g. repeat=7.0) to + * repeat this crawl. By default crawls will not be repeated. + * + * @param int|float $input + * @return $this + */ + public function setRepeat($input) + { + if (!is_numeric($input) || !$input) { + throw new InvalidArgumentException('Only positive numbers allowed.'); + } + $this->otherOptions['repeat'] = (float)$input; + + return $this; + } + + /** + * By default repeat crawls will only process new (previously unprocessed) + * pages. Set to 0 to process all content on repeat crawls. + * + * @param int $int + * @return $this + */ + public function setOnlyProcessIfNew($int = 1) + { + $this->otherOptions['onlyProcessIfNew'] = (int)(bool)$int; + + return $this; + } + + /** + * Specify the maximum number of crawl repeats. By default (maxRounds=0) + * repeating crawls will continue indefinitely. 
+ * + * @param int $input + * @return $this + */ + public function setMaxRounds($input = 0) + { + if ((int)$input < -1) { + $input = -1; + } + + $this->otherOptions['maxRounds'] = (int)$input; + + return $this; + } + + /** + * Ignores robots.txt if set to 0/false + * + * @param bool $bool + * @return $this + */ + public function setObeyRobots($bool = true) + { + $this->otherOptions['obeyRobots'] = (int)(bool)$bool; + + return $this; + } + + /** + * Force the start of a new crawl "round" (manually repeat the crawl). + * If onlyProcessIfNew is set to 1 (default), only newly-created pages will + * be processed. + * + * @param bool $commit + * @return EntityIterator + * @throws DiffbotException + */ + public function roundStart($commit = true) + { + $this->otherOptions = ['roundStart' => 1]; + + return ($commit) ? $this->call() : $this; + } + + /** + * Pause a crawl. + * + * @param bool $commit + * @return EntityIterator + * @throws DiffbotException + */ + public function pause($commit = true) + { + $this->otherOptions = ['pause' => 1]; + + return ($commit) ? $this->call() : $this; + } + + /** + * Pause a crawl. + * + * @param bool $commit + * @return EntityIterator + * @throws DiffbotException + */ + public function unpause($commit = true) + { + $this->otherOptions = ['pause' => 0]; + + return ($commit) ? $this->call() : $this; + } + + /** + * Restart removes all crawled data while maintaining crawl settings. + * + * @param bool $commit + * @return EntityIterator + * @throws DiffbotException + */ + public function restart($commit = true) + { + $this->otherOptions = ['restart' => 1]; + + return ($commit) ? $this->call() : $this; + } + + /** + * Delete a crawl, and all associated data, completely. + * + * @param bool $commit + * @return EntityIterator + * @throws DiffbotException + */ + public function delete($commit = true) + { + $this->otherOptions = ['delete' => 1]; + + return ($commit) ? 
$this->call() : $this; + } + + public function call() + { + $response = $this->diffbot->getHttpClient()->get($this->buildUrl()); + + $array = $response->json(); + + if (isset($array['jobs'])) { + $jobs = []; + foreach ($array['jobs'] as $job) { + $jobs[] = new JobCrawl($job); + } + + return new EntityIterator($jobs, $response); + } elseif (!isset($array['jobs']) && isset($array['response'])) { + return $array['response']; + } { + throw new DiffbotException('It appears something went wrong.'); + } + } + + /** + * Builds out the URL string that gets requested once `call()` is called + * + * @return string + */ + public function buildUrl() + { + + if (isset($this->otherOptions['urlProcessRegEx']) + && !empty($this->otherOptions['urlProcessRegEx']) + ) { + unset($this->otherOptions['urlProcessPattern']); + } + + if (isset($this->otherOptions['urlCrawlRegEx']) + && !empty($this->otherOptions['urlCrawlRegEx']) + ) { + unset($this->otherOptions['urlCrawlPattern']); + } + + $url = rtrim($this->apiUrl, '/') . '?'; + + // Add token + $url .= 'token=' . $this->diffbot->getToken(); + + if ($this->getName()) { + // Add name + $url .= '&name=' . $this->getName(); + + // Add seeds + $url .= '&seeds=' . implode(' ', array_map(function ($item) { + return urlencode($item); + }, $this->seeds)); + + // Add other options + foreach ($this->otherOptions as $option => $value) { + $url .= '&' . $option . '=' . $value; + } + + // Add API link + $url .= '&apiUrl=' . 
$this->getApiString(); + } + + return $url; + } + + /** + * @return string + */ + protected function getApiString() + { + if (!$this->api) { + $this->api = $this->diffbot->createAnalyzeAPI('crawl'); + $this->api->setMode('auto'); + } + + return urlencode($this->api->buildUrl()); + } +} \ No newline at end of file diff --git a/src/Api/Custom.php b/src/Api/Custom.php index 7a80c29..a23a06b 100644 --- a/src/Api/Custom.php +++ b/src/Api/Custom.php @@ -7,7 +7,7 @@ class Custom extends Api { /** @var string API URL to which to send the request */ - protected $apiUrl = 'http://api.diffbot.com/v3'; + protected $apiUrl = 'https://api.diffbot.com/v3'; public function __construct($url, $name) { diff --git a/src/Api/Discussion.php b/src/Api/Discussion.php index a0552c4..3316ec9 100644 --- a/src/Api/Discussion.php +++ b/src/Api/Discussion.php @@ -10,7 +10,7 @@ class Discussion extends Api use StandardApi; /** @var string API URL to which to send the request */ - protected $apiUrl = 'http://api.diffbot.com/v3/discussion'; + protected $apiUrl = 'https://api.diffbot.com/v3/discussion'; /** * Set the maximum number of pages in a thread to automatically concatenate diff --git a/src/Api/Image.php b/src/Api/Image.php index e87681f..ed64eb3 100644 --- a/src/Api/Image.php +++ b/src/Api/Image.php @@ -10,7 +10,7 @@ class Image extends Api use StandardApi; /** @var string API URL to which to send the request */ - protected $apiUrl = 'http://api.diffbot.com/v3/image'; + protected $apiUrl = 'https://api.diffbot.com/v3/image'; /** * Tells the API call to return the mentions field diff --git a/src/Api/Product.php b/src/Api/Product.php index 0109795..bb2b5ae 100644 --- a/src/Api/Product.php +++ b/src/Api/Product.php @@ -10,7 +10,7 @@ class Product extends Api use StandardApi; /** @var string API URL to which to send the request */ - protected $apiUrl = 'http://api.diffbot.com/v3/product'; + protected $apiUrl = 'https://api.diffbot.com/v3/product'; /** * If set to false, will not extract article 
comments in a Discussion diff --git a/src/Diffbot.php b/src/Diffbot.php index ed702ee..60da0bb 100644 --- a/src/Diffbot.php +++ b/src/Diffbot.php @@ -2,6 +2,7 @@ namespace Swader\Diffbot; +use Swader\Diffbot\Api\Crawl; use Swader\Diffbot\Api\Custom; use Swader\Diffbot\Exceptions\DiffbotException; use Swader\Diffbot\Api\Product; @@ -11,6 +12,7 @@ use Swader\Diffbot\Api\Discussion; use GuzzleHttp\Client; use Swader\Diffbot\Factory\Entity; +use Swader\Diffbot\Interfaces\Api; use Swader\Diffbot\Interfaces\EntityFactory; /** @@ -230,4 +232,24 @@ public function createCustomAPI($url, $name) return $api->registerDiffbot($this); } + /** + * Creates a new Crawljob with the given name. + * + * @see https://www.diffbot.com/dev/docs/crawl/ + * + * @param string $name Name of the crawljob. Needs to be unique. + * @param Api $api Optional instance of an API - if omitted, must be set + * later manually + * @return Crawl + */ + public function crawl($name = null, Api $api = null) + { + $api = new Crawl($name, $api); + if (!$this->getHttpClient()) { + $this->setHttpClient(); + $this->setEntityFactory(); + } + return $api->registerDiffbot($this); + } + } \ No newline at end of file diff --git a/src/Entity/EntityIterator.php b/src/Entity/EntityIterator.php index 0820432..5fc1b9e 100644 --- a/src/Entity/EntityIterator.php +++ b/src/Entity/EntityIterator.php @@ -65,13 +65,25 @@ public function valid() return ($this->cursor < $this->count()); } + protected function _getZerothEntity() + { + return ($this->cursor == -1) ? 
$this->data[0] : $this->current(); + } + public function __call($name, $args) { $isGetter = substr($name, 0, 3) == 'get'; - if ($isGetter) { - $property = lcfirst(substr($name, 3, strlen($name) - 3)); - return $this->$property; + if ($isGetter) { + $zeroth = $this->_getZerothEntity(); + if (method_exists($this->_getZerothEntity(), $name)) { + $rv = $zeroth->$name(...$args); + } else { + $property = lcfirst(substr($name, 3, strlen($name) - 3)); + $rv = $zeroth->$property; + } + + return $rv; } throw new \BadMethodCallException('No such method: ' . $name); @@ -79,7 +91,7 @@ public function __call($name, $args) public function __get($name) { - $entity = ($this->cursor == -1) ? $this->data[0] : $this->current(); + $entity = $this->_getZerothEntity(); return $entity->$name; } diff --git a/src/Entity/JobBulk.php b/src/Entity/JobBulk.php new file mode 100644 index 0000000..95692a1 --- /dev/null +++ b/src/Entity/JobBulk.php @@ -0,0 +1,9 @@ +data['maxToCrawl']; + } + + /** + * Maximum number of pages to process + * @see http://support.diffbot.com/crawlbot/whats-the-difference-between-crawling-and-processing/ + * + * @return int + */ + public function getMaxToProcess() + { + return (int)$this->data['maxToProcess']; + } + + /** + * Whether or not the job was set to only process newly found links, + * ignoring old but potentially updated ones + * + * @return bool + */ + public function getOnlyProcessIfNew() + { + return (bool)$this->data['onlyProcessIfNew']; + } + + /** + * Seed URLs provided to the job. Always returned as array. 
+ * + * @return array + */ + public function getSeeds() + { + return explode(' ', $this->data['seeds']); + } +} \ No newline at end of file diff --git a/src/Interfaces/Api.php b/src/Interfaces/Api.php index 2852d63..67c68d2 100644 --- a/src/Interfaces/Api.php +++ b/src/Interfaces/Api.php @@ -1,10 +1,23 @@ diffbot = $d; + + return $this; + } +} \ No newline at end of file diff --git a/tests/Api/AnalyzeApiTest.php b/tests/Api/AnalyzeApiTest.php index 8f52f4d..abecf8e 100644 --- a/tests/Api/AnalyzeApiTest.php +++ b/tests/Api/AnalyzeApiTest.php @@ -50,7 +50,7 @@ public function testBuildUrlNoCustomFields() $url = $this ->apiWithMock ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/analyze?token=demo&url=https%3A%2F%2Farticle-mock.com'; + $expectedUrl = 'https://api.diffbot.com/v3/analyze?token=demo&url=https%3A%2F%2Farticle-mock.com'; $this->assertEquals($expectedUrl, $url); } @@ -60,7 +60,7 @@ public function testBuildUrlOneCustomField() ->apiWithMock ->setMeta(true) ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/analyze?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta'; + $expectedUrl = 'https://api.diffbot.com/v3/analyze?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta'; $this->assertEquals($expectedUrl, $url); } @@ -71,7 +71,7 @@ public function testBuildUrlTwoCustomFields() ->setMeta(true) ->setLinks(true) ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/analyze?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta,links'; + $expectedUrl = 'https://api.diffbot.com/v3/analyze?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta,links'; $this->assertEquals($expectedUrl, $url); } @@ -84,7 +84,7 @@ public function testBuildUrlFourCustomFields() ->setBreadcrumb(true) ->setQuerystring(true) ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/analyze?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta,links,breadcrumb,querystring'; + $expectedUrl = 
'https://api.diffbot.com/v3/analyze?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta,links,breadcrumb,querystring'; $this->assertEquals($expectedUrl, $url); } @@ -95,7 +95,7 @@ public function testBuildUrlOtherOptionsOnly() ->setMode('article') ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/analyze?token=demo&url=https%3A%2F%2Farticle-mock.com&discussion=false&mode=article'; + $expectedUrl = 'https://api.diffbot.com/v3/analyze?token=demo&url=https%3A%2F%2Farticle-mock.com&discussion=false&mode=article'; $this->assertEquals($expectedUrl, $url); } @@ -110,7 +110,7 @@ public function testBuildUrlOtherOptionsAndCustomFields() ->setDiscussion(false) ->setMode('product') ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/analyze?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta,links,breadcrumb,querystring&discussion=false&mode=product'; + $expectedUrl = 'https://api.diffbot.com/v3/analyze?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta,links,breadcrumb,querystring&discussion=false&mode=product'; $this->assertEquals($expectedUrl, $url); } diff --git a/tests/Api/ArticleApiTest.php b/tests/Api/ArticleApiTest.php index 0a2d5e6..0d02351 100644 --- a/tests/Api/ArticleApiTest.php +++ b/tests/Api/ArticleApiTest.php @@ -57,7 +57,7 @@ public function testBuildUrlNoCustomFields() $url = $this ->apiWithMock ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/article?token=demo&url=https%3A%2F%2Farticle-mock.com'; + $expectedUrl = 'https://api.diffbot.com/v3/article?token=demo&url=https%3A%2F%2Farticle-mock.com'; $this->assertEquals($expectedUrl, $url); } @@ -67,7 +67,7 @@ public function testBuildUrlOneCustomField() ->apiWithMock ->setMeta(true) ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/article?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta'; + $expectedUrl = 'https://api.diffbot.com/v3/article?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta'; $this->assertEquals($expectedUrl, $url); } @@ -78,7 
+78,7 @@ public function testBuildUrlTwoCustomFields() ->setMeta(true) ->setLinks(true) ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/article?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta,links'; + $expectedUrl = 'https://api.diffbot.com/v3/article?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta,links'; $this->assertEquals($expectedUrl, $url); } @@ -92,7 +92,7 @@ public function testBuildUrlFourCustomFields() ->setQuerystring(true) ->setSentiment(true) ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/article?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta,links,breadcrumb,querystring,sentiment'; + $expectedUrl = 'https://api.diffbot.com/v3/article?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta,links,breadcrumb,querystring,sentiment'; $this->assertEquals($expectedUrl, $url); } @@ -104,7 +104,7 @@ public function testBuildUrlOtherOptionsOnly() ->setDiscussion(false) ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/article?token=demo&url=https%3A%2F%2Farticle-mock.com&paging=false&maxTags=10&discussion=false'; + $expectedUrl = 'https://api.diffbot.com/v3/article?token=demo&url=https%3A%2F%2Farticle-mock.com&paging=false&maxTags=10&discussion=false'; $this->assertEquals($expectedUrl, $url); } @@ -119,7 +119,7 @@ public function testBuildUrlOtherOptionsAndCustomFields() ->setQuerystring(true) ->setMaxTags(10) ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/article?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta,links,breadcrumb,querystring&paging=false&maxTags=10'; + $expectedUrl = 'https://api.diffbot.com/v3/article?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta,links,breadcrumb,querystring&paging=false&maxTags=10'; $this->assertEquals($expectedUrl, $url); } diff --git a/tests/Api/CrawlCustomMocksTest.php b/tests/Api/CrawlCustomMocksTest.php new file mode 100644 index 0000000..e880c03 --- /dev/null +++ b/tests/Api/CrawlCustomMocksTest.php @@ -0,0 +1,125 @@ 
+setEntityFactory(); + $fakeClient = new Client(); + $diffbot->setHttpClient($fakeClient); + $this->diffbot = $diffbot; + } + + public function testRoundStart() + { + $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( + [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-20/sitepoint_01_roundstart.json')] + )); + + $c = $this->diffbot->crawl('sitepoint_01'); + + /** @var JobCrawl $j */ + $j = $c->roundStart(); + + $this->assertTrue($j->getRoundStartTime() == $j->getCurrentTime()); + $this->assertTrue($j->getPageCrawlInfo()['successesThisRound'] == 0); + } + + public function testRestart() + { + $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( + [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-20/sitepoint_01_restart.json')] + )); + + $c = $this->diffbot->crawl('sitepoint_01'); + + /** @var JobCrawl $j */ + $j = $c->restart(); + + $this->assertTrue($j->getObjectsFound() == 0); + $this->assertTrue($j->getUrlsHarvested() == 0); + $this->assertTrue($j->getPageCrawlInfo()['successes'] == 0); + $this->assertTrue($j->getPageCrawlInfo()['attempts'] == 0); + $this->assertTrue($j->getPageCrawlInfo()['successesThisRound'] == 0); + $this->assertTrue($j->getPageProcessInfo()['successes'] == 0); + $this->assertTrue($j->getPageProcessInfo()['attempts'] == 0); + $this->assertTrue($j->getPageProcessInfo()['successesThisRound'] == 0); + } + + public function testPauseOn() + { + $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( + [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-20/sitepoint_01_paused.json')] + )); + + $c = $this->diffbot->crawl('sitepoint_01'); + + /** @var JobCrawl $j */ + $j = $c->pause(); + + $this->assertEquals(6, $j->getJobStatus()['status']); + $this->assertEquals('Job paused.', $j->getJobStatus()['message']); + } + + public function testPauseOff() + { + $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( + [file_get_contents(__DIR__ . 
'/../Mocks/Crawlbot/15-05-20/sitepoint_01_unpaused.json')] + )); + + $c = $this->diffbot->crawl('sitepoint_01'); + + /** @var JobCrawl $j */ + $j = $c->unpause(); + + $this->assertEquals(7, $j->getJobStatus()['status']); + $this->assertEquals('Job is in progress.', $j->getJobStatus()['message']); + } + + public function testDelete() + { + $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( + [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-20/deletedSuccess.json')] + )); + + $c = $this->diffbot->crawl('sitepoint_01'); + + $this->assertEquals('Successfully deleted job.', $c->delete()); + } + + public function test500() + { + $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( + [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-20/invalid_name.json')] + )); + + $c = $this->diffbot->crawl('sitepoint_01'); + + $this->setExpectedException('GuzzleHttp\Exception\ServerException'); + $c->call(); + } + + public function testOtherError() + { + $this->diffbot->getHttpClient()->getEmitter()->attach(new Mock( + [file_get_contents(__DIR__ . '/../Mocks/Crawlbot/15-05-20/invalid_response.json')] + )); + + $c = $this->diffbot->crawl('sitepoint_01'); + + $this->setExpectedException('Swader\Diffbot\Exceptions\DiffbotException'); + $c->call(); + } +} diff --git a/tests/Api/CrawlTest.php b/tests/Api/CrawlTest.php new file mode 100644 index 0000000..3b11c82 --- /dev/null +++ b/tests/Api/CrawlTest.php @@ -0,0 +1,378 @@ +getEmitter()->attach($this->getValidMock()); + + $diffbot->setHttpClient($fakeClient); + $diffbot->setEntityFactory(); + + $this->diffbot = $diffbot; + } + + protected function getValidMock() + { + if (!$this->validMock) { + $this->validMock = new Mock( + [file_get_contents(__DIR__ . 
'/../Mocks/Crawlbot/15-05-18/sitepoint_01_maxCrawled.json')] + ); + } + + return $this->validMock; + } + + public function testBuildUrlListJobs() + { + $expected = 'https://api.diffbot.com/v3/crawl?token=demo'; + $c = $this->diffbot->crawl(); + $this->assertEquals($expected, $c->buildUrl()); + } + + public function testBuildUrlArticleApi() + { + $api = $this->diffbot->createArticleAPI('crawl')->setDiscussion(false); + + $expected = 'https://api.diffbot.com/v3/crawl?token=demo&name=sitepoint_01&seeds=http%3A%2F%2Fsitepoint.com&apiUrl=https%3A%2F%2Fapi.diffbot.com%2Fv3%2Farticle%3Ftoken%3Ddemo%26url%3Dcrawl%26discussion%3Dfalse'; + $c = $this->diffbot->crawl('sitepoint_01', $api); + $c->setSeeds(['http://sitepoint.com']); + $this->assertEquals($expected, $c->buildUrl()); + } + + public function testBuildUrlDefaultApi() + { + $expected = 'https://api.diffbot.com/v3/crawl?token=demo&name=sitepoint_01&seeds=http%3A%2F%2Fsitepoint.com&apiUrl=https%3A%2F%2Fapi.diffbot.com%2Fv3%2Fanalyze%3Ftoken%3Ddemo%26url%3Dcrawl%26mode%3Dauto'; + $c = $this->diffbot->crawl('sitepoint_01'); + $c->setSeeds(['http://sitepoint.com']); + $this->assertEquals($expected, $c->buildUrl()); + } + + public function testInvalidSeeds() + { + $this->setExpectedException('InvalidArgumentException'); + + $c = $this->diffbot->crawl('sitepoint_01'); + $c->setSeeds(['http://sitepoint.com' . 
'foo', 'wakakakablah']); + } + + public function testPatternSetters() + { + $expected = 'https://api.diffbot.com/v3/crawl?token=demo&name=sitepoint_01&seeds=http%3A%2F%2Fsitepoint.com&pageProcessPattern=class%3DarticleBody&urlCrawlPattern=%2Fcategory%2Fshoes||%21%2Fauthor%2F||%5Ehttp%3A%2F%2Fwww.diffbot.com||type%3Dproduct%24&urlProcessPattern=%2Fproduct%2Fdetail||%21%3Fcurrency%3Deuro&apiUrl=https%3A%2F%2Fapi.diffbot.com%2Fv3%2Fanalyze%3Ftoken%3Ddemo%26url%3Dcrawl%26mode%3Dauto'; + $c = $this->diffbot->crawl('sitepoint_01'); + $c->setSeeds(['http://sitepoint.com']); + + $c->setPageProcessPatterns(['class=articleBody']); + $c->setUrlCrawlPatterns([ + '/category/shoes', + '!/author/', + '^http://www.diffbot.com', + 'type=product$' + ]); + $c->setUrlProcessPatterns(['/product/detail', '!?currency=euro']); + + $this->assertEquals($expected, $c->buildUrl()); + } + + public function testRegexSetters() + { + $expected = 'https://api.diffbot.com/v3/crawl?token=demo&name=sitepoint_01&seeds=http%3A%2F%2Fsitepoint.com&urlCrawlRegEx=/^[a-z0-9_-]{3,16}$/&urlProcessRegEx=/^#?([a-f0-9]{6}|[a-f0-9]{3})$/&apiUrl=https%3A%2F%2Fapi.diffbot.com%2Fv3%2Fanalyze%3Ftoken%3Ddemo%26url%3Dcrawl%26mode%3Dauto'; + $c = $this->diffbot->crawl('sitepoint_01'); + $c->setSeeds(['http://sitepoint.com']); + + $c->setUrlCrawlRegEx('/^[a-z0-9_-]{3,16}$/'); + $c->setUrlProcessRegEx('/^#?([a-f0-9]{6}|[a-f0-9]{3})$/'); + + $this->assertEquals($expected, $c->buildUrl()); + } + + public function maxHopsProvider() + { + return [ + [-100, "-1"], + [-1, "-1"], + [0, "0"], + [1, "1"], + [5, "5"], + [100, "100"] + ]; + } + + /** + * @dataProvider maxHopsProvider + * @param $input + * @param $urlFragment + */ + public function testMaxHops($input, $urlFragment) + { + $expected = 'https://api.diffbot.com/v3/crawl?token=demo&name=sitepoint_01&seeds=http%3A%2F%2Fsitepoint.com&maxHops=' . $urlFragment . 
'&apiUrl=https%3A%2F%2Fapi.diffbot.com%2Fv3%2Fanalyze%3Ftoken%3Ddemo%26url%3Dcrawl%26mode%3Dauto'; + $c = $this->diffbot->crawl('sitepoint_01'); + $c->setSeeds(['http://sitepoint.com']); + + $c->setMaxHops($input); + + $this->assertEquals($expected, $c->buildUrl()); + } + + public function maxProvider() + { + return [ + [1, "1"], + [-1, "1"], + [0, "1"], + [5, "5"], + [500, "500"], + [1000000, "1000000"] + ]; + } + + /** + * @dataProvider maxProvider + * @param $input + * @param $urlFragment + */ + public function testMax($input, $urlFragment) + { + $expected = 'https://api.diffbot.com/v3/crawl?token=demo&name=sitepoint_01&seeds=http%3A%2F%2Fsitepoint.com&maxToCrawl=' . $urlFragment . '&maxToProcess=' . $urlFragment . '&apiUrl=https%3A%2F%2Fapi.diffbot.com%2Fv3%2Fanalyze%3Ftoken%3Ddemo%26url%3Dcrawl%26mode%3Dauto'; + $c = $this->diffbot->crawl('sitepoint_01'); + $c->setSeeds(['http://sitepoint.com']); + + $c->setMaxToCrawl($input); + $c->setMaxToProcess($input); + + $this->assertEquals($expected, $c->buildUrl()); + } + + public function notifyProviderOk() + { + return [ + ['bruno@skvorc.me', 'notifyEmail=bruno@skvorc.me'], + [ + 'http://bruno.skvorc.me/somewebhook?diffbotIsDone=true', + 'notifyWebhook=http%3A%2F%2Fbruno.skvorc.me%2Fsomewebhook%3FdiffbotIsDone%3Dtrue' + ], + [ + [ + 'bruno@skvorc.me', + 'http://bruno.skvorc.me/somewebhook?diffbotIsDone=true' + ], + 'notifyEmail=bruno@skvorc.me&notifyWebhook=http%3A%2F%2Fbruno.skvorc.me%2Fsomewebhook%3FdiffbotIsDone%3Dtrue' + ] + ]; + } + + /** + * @dataProvider notifyProviderOk + * @param $input + * @param $urlFragment + */ + public function testNotify($input, $urlFragment) + { + $expected = 'https://api.diffbot.com/v3/crawl?token=demo&name=sitepoint_01&seeds=http%3A%2F%2Fsitepoint.com&' . $urlFragment . 
'&apiUrl=https%3A%2F%2Fapi.diffbot.com%2Fv3%2Fanalyze%3Ftoken%3Ddemo%26url%3Dcrawl%26mode%3Dauto'; + $c = $this->diffbot->crawl('sitepoint_01'); + $c->setSeeds(['http://sitepoint.com']); + + foreach ((array)$input as $i) { + $c->notify($i); + } + + $this->assertEquals($expected, $c->buildUrl()); + } + + public function notifyProviderNotOk() + { + return [ + [5], + ['foo'], + ['htp:/someurl'] + ]; + } + + /** + * @dataProvider notifyProviderNotOk + * @param $input + */ + public function testNotifyFail($input) + { + $c = $this->diffbot->crawl('sitepoint_01'); + $c->setSeeds(['http://sitepoint.com']); + + $this->setExpectedException('InvalidArgumentException'); + $c->notify($input); + } + + public function crawlDelayProviderOk() + { + return [ + [0.25, '0.25'], + [1, '1'], + [5, '5'], + [0, '0'], + [100, '100'], + [-5, '0.25'], + [-0.25, '0.25'] + ]; + } + + /** + * @dataProvider crawlDelayProviderOk + * @param $input + * @param $urlFragment + */ + public function testCrawlOk($input, $urlFragment) + { + $expected = 'https://api.diffbot.com/v3/crawl?token=demo&name=sitepoint_01&seeds=http%3A%2F%2Fsitepoint.com&crawlDelay=' . $urlFragment . 
'&apiUrl=https%3A%2F%2Fapi.diffbot.com%2Fv3%2Fanalyze%3Ftoken%3Ddemo%26url%3Dcrawl%26mode%3Dauto'; + $c = $this->diffbot->crawl('sitepoint_01'); + $c->setSeeds(['http://sitepoint.com']); + + $c->setCrawlDelay($input); + $this->assertEquals($expected, $c->buildUrl()); + } + + public function crawlDelayProviderNotOk() + { + return [ + ['foo'], + ['blah'], + ['0482kjvjs'], + ['cvojhshjvs4920'], + [true], + [false], + [null] + ]; + } + + /** + * @dataProvider crawlDelayProviderNotOk + * @param $input + */ + public function testCrawlNotOk($input) + { + $c = $this->diffbot->crawl('sitepoint_01'); + $c->setSeeds(['http://sitepoint.com']); + + $this->setExpectedException('InvalidArgumentException'); + $c->setCrawlDelay($input); + } + + public function repeatProviderOk() + { + return [ + [1, '1'], + [5, '5'], + [0.25, '0.25'] + ]; + } + + /** + * @dataProvider repeatProviderOk + * @param $input + * @param $urlFragment + */ + public function testRepeatOk($input, $urlFragment) + { + $expected = 'https://api.diffbot.com/v3/crawl?token=demo&name=sitepoint_01&seeds=http%3A%2F%2Fsitepoint.com&repeat=' . $urlFragment . 
'&apiUrl=https%3A%2F%2Fapi.diffbot.com%2Fv3%2Fanalyze%3Ftoken%3Ddemo%26url%3Dcrawl%26mode%3Dauto'; + $c = $this->diffbot->crawl('sitepoint_01'); + $c->setSeeds(['http://sitepoint.com']); + + $c->setRepeat($input); + $this->assertEquals($expected, $c->buildUrl()); + + } + + public function repeatProviderNotOk() + { + return [ + [0], + ['foo'], + [false], + [null] + ]; + } + + /** + * @dataProvider repeatProviderNotOk + * @param $input + */ + public function testRepeatNotOk($input) + { + $c = $this->diffbot->crawl('sitepoint_01'); + $c->setSeeds(['http://sitepoint.com']); + + $this->setExpectedException('InvalidArgumentException'); + $c->setRepeat($input); + } + + public function testOnlyProcessIfNew() + { + $expected1 = 'https://api.diffbot.com/v3/crawl?token=demo&name=sitepoint_01&seeds=http%3A%2F%2Fsitepoint.com&onlyProcessIfNew=1&apiUrl=https%3A%2F%2Fapi.diffbot.com%2Fv3%2Fanalyze%3Ftoken%3Ddemo%26url%3Dcrawl%26mode%3Dauto'; + $expected2 = 'https://api.diffbot.com/v3/crawl?token=demo&name=sitepoint_01&seeds=http%3A%2F%2Fsitepoint.com&onlyProcessIfNew=0&apiUrl=https%3A%2F%2Fapi.diffbot.com%2Fv3%2Fanalyze%3Ftoken%3Ddemo%26url%3Dcrawl%26mode%3Dauto'; + + $c = $this->diffbot->crawl('sitepoint_01'); + $c->setSeeds(['http://sitepoint.com']); + + $c->setOnlyProcessIfNew(1); + $this->assertEquals($expected1, $c->buildUrl()); + $c->setOnlyProcessIfNew(0); + $this->assertEquals($expected2, $c->buildUrl()); + } + + public function maxRoundsProvider() + { + return [ + [-100, "-1"], + [-1, "-1"], + [0, "0"], + [1, "1"], + [5, "5"], + [100, "100"] + ]; + } + + /** + * @dataProvider maxRoundsProvider + * @param $input + * @param $urlFragment + */ + public function testMaxRounds($input, $urlFragment) + { + $expected = 'https://api.diffbot.com/v3/crawl?token=demo&name=sitepoint_01&seeds=http%3A%2F%2Fsitepoint.com&maxRounds=' . $urlFragment . 
'&apiUrl=https%3A%2F%2Fapi.diffbot.com%2Fv3%2Fanalyze%3Ftoken%3Ddemo%26url%3Dcrawl%26mode%3Dauto'; + $c = $this->diffbot->crawl('sitepoint_01'); + $c->setSeeds(['http://sitepoint.com']); + + $c->setMaxRounds($input); + + $this->assertEquals($expected, $c->buildUrl()); + } + + public function testObeyRobots() + { + $expected1 = 'https://api.diffbot.com/v3/crawl?token=demo&name=sitepoint_01&seeds=http%3A%2F%2Fsitepoint.com&obeyRobots=1&apiUrl=https%3A%2F%2Fapi.diffbot.com%2Fv3%2Fanalyze%3Ftoken%3Ddemo%26url%3Dcrawl%26mode%3Dauto'; + $expected2 = 'https://api.diffbot.com/v3/crawl?token=demo&name=sitepoint_01&seeds=http%3A%2F%2Fsitepoint.com&obeyRobots=0&apiUrl=https%3A%2F%2Fapi.diffbot.com%2Fv3%2Fanalyze%3Ftoken%3Ddemo%26url%3Dcrawl%26mode%3Dauto'; + $c = $this->diffbot->crawl('sitepoint_01'); + $c->setSeeds(['http://sitepoint.com']); + + + $c->setObeyRobots(); + $this->assertEquals($expected1, $c->buildUrl()); + + $c->setObeyRobots(0); + $this->assertEquals($expected2, $c->buildUrl()); + $c->setObeyRobots(false); + $this->assertEquals($expected2, $c->buildUrl()); + } +} diff --git a/tests/Api/CustomApiTest.php b/tests/Api/CustomApiTest.php index e85ed2e..3c36e97 100644 --- a/tests/Api/CustomApiTest.php +++ b/tests/Api/CustomApiTest.php @@ -47,23 +47,23 @@ public function apiNameProvider() return [ [ 'custom', - 'http://api.diffbot.com/v3/custom?token=demo&url=http%3A%2F%2Fsample-url.com' + 'https://api.diffbot.com/v3/custom?token=demo&url=http%3A%2F%2Fsample-url.com' ], [ 'authorFolioNew', - 'http://api.diffbot.com/v3/authorFolioNew?token=demo&url=http%3A%2F%2Fsample-url.com' + 'https://api.diffbot.com/v3/authorFolioNew?token=demo&url=http%3A%2F%2Fsample-url.com' ], [ 'authorFolioNew/something', - 'http://api.diffbot.com/v3/authorFolioNew/something?token=demo&url=http%3A%2F%2Fsample-url.com' + 'https://api.diffbot.com/v3/authorFolioNew/something?token=demo&url=http%3A%2F%2Fsample-url.com' ], [ 'my-api', - 
'http://api.diffbot.com/v3/my-api?token=demo&url=http%3A%2F%2Fsample-url.com' + 'https://api.diffbot.com/v3/my-api?token=demo&url=http%3A%2F%2Fsample-url.com' ], [ 'my-api?param=value', - 'http://api.diffbot.com/v3/my-api?param=value?token=demo&url=http%3A%2F%2Fsample-url.com' + 'https://api.diffbot.com/v3/my-api?param=value?token=demo&url=http%3A%2F%2Fsample-url.com' ] ]; } @@ -96,7 +96,7 @@ public function testCreationAndUrl($name, $url) // public function testInvalidNames($name) // { // $this->setExpectedException('Swader\Diffbot\Exceptions\DiffbotException'); -// $this->diffbot->createCustomAPI('http://sample-url.com', $name); +// $this->diffbot->createCustomAPI('https://sample-url.com', $name); // } } diff --git a/tests/Api/DiscussionApiTest.php b/tests/Api/DiscussionApiTest.php index 209ec0f..1fc467b 100644 --- a/tests/Api/DiscussionApiTest.php +++ b/tests/Api/DiscussionApiTest.php @@ -54,7 +54,7 @@ public function testBuildUrlNoCustomFields() $url = $this ->apiWithMock ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/discussion?token=demo&url=https%3A%2F%2Fdiscussion-mock.com'; + $expectedUrl = 'https://api.diffbot.com/v3/discussion?token=demo&url=https%3A%2F%2Fdiscussion-mock.com'; $this->assertEquals($expectedUrl, $url); } @@ -64,7 +64,7 @@ public function testBuildUrlOneCustomField() ->apiWithMock ->setMeta(true) ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/discussion?token=demo&url=https%3A%2F%2Fdiscussion-mock.com&fields=meta'; + $expectedUrl = 'https://api.diffbot.com/v3/discussion?token=demo&url=https%3A%2F%2Fdiscussion-mock.com&fields=meta'; $this->assertEquals($expectedUrl, $url); } @@ -75,7 +75,7 @@ public function testBuildUrlTwoCustomFields() ->setMeta(true) ->setLinks(true) ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/discussion?token=demo&url=https%3A%2F%2Fdiscussion-mock.com&fields=meta,links'; + $expectedUrl = 
'https://api.diffbot.com/v3/discussion?token=demo&url=https%3A%2F%2Fdiscussion-mock.com&fields=meta,links'; $this->assertEquals($expectedUrl, $url); } @@ -89,7 +89,7 @@ public function testBuildUrlFourCustomFields() ->setQuerystring(true) ->setSentiment(true) ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/discussion?token=demo&url=https%3A%2F%2Fdiscussion-mock.com&fields=meta,links,breadcrumb,querystring,sentiment'; + $expectedUrl = 'https://api.diffbot.com/v3/discussion?token=demo&url=https%3A%2F%2Fdiscussion-mock.com&fields=meta,links,breadcrumb,querystring,sentiment'; $this->assertEquals($expectedUrl, $url); } @@ -99,7 +99,7 @@ public function testBuildUrlOtherOptionsOnly() ->setMaxPages(10) ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/discussion?token=demo&url=https%3A%2F%2Fdiscussion-mock.com&maxPages=10'; + $expectedUrl = 'https://api.diffbot.com/v3/discussion?token=demo&url=https%3A%2F%2Fdiscussion-mock.com&maxPages=10'; $this->assertEquals($expectedUrl, $url); } @@ -113,7 +113,7 @@ public function testBuildUrlOtherOptionsAndCustomFields() ->setQuerystring(true) ->setMaxPages('all') ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/discussion?token=demo&url=https%3A%2F%2Fdiscussion-mock.com&fields=meta,links,breadcrumb,querystring&maxPages=all'; + $expectedUrl = 'https://api.diffbot.com/v3/discussion?token=demo&url=https%3A%2F%2Fdiscussion-mock.com&fields=meta,links,breadcrumb,querystring&maxPages=all'; $this->assertEquals($expectedUrl, $url); } diff --git a/tests/Api/ImageApiTest.php b/tests/Api/ImageApiTest.php index 1096661..5242c8a 100644 --- a/tests/Api/ImageApiTest.php +++ b/tests/Api/ImageApiTest.php @@ -56,7 +56,7 @@ public function testBuildUrlNoCustomFields() $url = $this ->apiWithMock ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/image?token=demo&url=https%3A%2F%2Farticle-mock.com'; + $expectedUrl = 'https://api.diffbot.com/v3/image?token=demo&url=https%3A%2F%2Farticle-mock.com'; 
$this->assertEquals($expectedUrl, $url); } @@ -66,7 +66,7 @@ public function testBuildUrlOneCustomField() ->apiWithMock ->setMeta(true) ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/image?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta'; + $expectedUrl = 'https://api.diffbot.com/v3/image?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta'; $this->assertEquals($expectedUrl, $url); } @@ -77,7 +77,7 @@ public function testBuildUrlTwoCustomFields() ->setMeta(true) ->setLinks(true) ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/image?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta,links'; + $expectedUrl = 'https://api.diffbot.com/v3/image?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta,links'; $this->assertEquals($expectedUrl, $url); } @@ -93,7 +93,7 @@ public function testBuildUrlFourCustomFields() ->setFaces(true) ->setMentions(true) ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/image?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta,links,breadcrumb,querystring,ocr,faces,mentions'; + $expectedUrl = 'https://api.diffbot.com/v3/image?token=demo&url=https%3A%2F%2Farticle-mock.com&fields=meta,links,breadcrumb,querystring,ocr,faces,mentions'; $this->assertEquals($expectedUrl, $url); } diff --git a/tests/Api/ProductApiTest.php b/tests/Api/ProductApiTest.php index 83e19e2..9d84ace 100644 --- a/tests/Api/ProductApiTest.php +++ b/tests/Api/ProductApiTest.php @@ -54,7 +54,7 @@ public function testBuildUrlNoCustomFields() { $url = $this ->apiWithMock ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/product?token=demo&url=https%3A%2F%2Fdogbrush-mock.com'; + $expectedUrl = 'https://api.diffbot.com/v3/product?token=demo&url=https%3A%2F%2Fdogbrush-mock.com'; $this->assertEquals($expectedUrl, $url); } @@ -65,7 +65,7 @@ public function testBuildUrlMultipleCustomFields() { ->setSize(true) ->setAvailability(true) ->buildUrl(); - $expectedUrl = 
'http://api.diffbot.com/v3/product?token=demo&url=https%3A%2F%2Fdogbrush-mock.com&fields=colors,size,availability'; + $expectedUrl = 'https://api.diffbot.com/v3/product?token=demo&url=https%3A%2F%2Fdogbrush-mock.com&fields=colors,size,availability'; $this->assertEquals($expectedUrl, $url); } @@ -77,7 +77,7 @@ public function testBuildUrlMultipleCustomFieldsAndOtherOptions() { ->setAvailability(true) ->setDiscussion(false) ->buildUrl(); - $expectedUrl = 'http://api.diffbot.com/v3/product?token=demo&url=https%3A%2F%2Fdogbrush-mock.com&fields=colors,size,availability&discussion=false'; + $expectedUrl = 'https://api.diffbot.com/v3/product?token=demo&url=https%3A%2F%2Fdogbrush-mock.com&fields=colors,size,availability&discussion=false'; $this->assertEquals($expectedUrl, $url); } } diff --git a/tests/DiffbotTest.php b/tests/DiffbotTest.php index 6ff2259..22260d5 100644 --- a/tests/DiffbotTest.php +++ b/tests/DiffbotTest.php @@ -135,4 +135,13 @@ public function testCustomApiCreation() ); } + public function testCrawlCreation() + { + $bot = new Diffbot('token'); + $api = $bot->crawl('test'); + $this->assertInstanceOf( + 'Swader\Diffbot\Api\Crawl', $api + ); + } + } \ No newline at end of file diff --git a/tests/Entity/CrawlJobTest.php b/tests/Entity/CrawlJobTest.php new file mode 100644 index 0000000..3faa36a --- /dev/null +++ b/tests/Entity/CrawlJobTest.php @@ -0,0 +1,639 @@ +prepareResponses(); + /** @var ResponseInterface $response */ + $response = $this->responses[$file]; + $jobs = []; + foreach ($response->json()['jobs'] as $data) { + $jobs[] = new Job($data); + } + + return new EntityIterator($jobs, $response); + } + + public function returnFiles() + { + $files = []; + foreach ($this->files as $file) { + $files[] = [$file]; + } + + return $files; + } + + /** + * @dataProvider returnFiles + */ + public function testType($file) + { + /** @var Image $entity */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals('crawl', $entity->getType()); + } + } + + 
public function nameProvider() + { + return [ + ['Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', 'sitepoint_01'] + ]; + } + + /** + * @dataProvider nameProvider + * @param $file + * @param $input + */ + public function testName($file, $input) + { + /** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($input, $entity->getName()); + } + } + + public function timeProvider() + { + return [ + [ + 'Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', + [ + 1431865254, + 1431928375, + 1431981899, + 1431981899 + ] + ] + ]; + } + + /** + * @dataProvider timeProvider + * @param $file + * @param $input + */ + public function testTime($file, $input) + { + /** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($input[0], $entity->getJobCreationTimeUTC()); + $this->assertEquals($input[1], $entity->getJobCompletionTimeUTC()); + $this->assertEquals($input[2], $entity->getCurrentTime()); + $this->assertEquals($input[3], $entity->getCurrentTimeUTC()); + } + } + + public function statusProvider() + { + return [ + [ + 'Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', + [ + "status" => 2, + "message" => "Job has reached maxToCrawl limit." 
+ ] + ] + ]; + } + + /** + * @dataProvider statusProvider + * @param $file + * @param $input + */ + public function testStatus($file, $input) + { + /** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($input, $entity->getJobStatus()); + } + } + + public function notificationSentProvider() + { + return [ + ['Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', 1] + ]; + } + + /** + * @dataProvider notificationSentProvider + * @param $file + * @param $input + */ + public function testNotificationSent($file, $input) + { + /** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($input, $entity->getNotificationSent()); + } + } + + public function countProvider() + { + return [ + [ + 'Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', + [ + 91500, + 3219125, + 107872, + 100000, + 100000, + 91957, + 91500, + 91500 + ] + ] + ]; + } + + /** + * @dataProvider countProvider + * @param $file + * @param $input + */ + public function testCounts($file, $input) + { + /** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($input[0], $entity->getObjectsFound()); + $this->assertEquals($input[1], $entity->getUrlsHarvested()); + $this->assertEquals($input[2], + $entity->getPageCrawlInfo()['attempts']); + $this->assertEquals($input[3], + $entity->getPageCrawlInfo()['successes']); + $this->assertEquals($input[4], + $entity->getPageCrawlInfo()['successesThisRound']); + $this->assertEquals($input[5], + $entity->getPageProcessInfo()['attempts']); + $this->assertEquals($input[6], + $entity->getPageProcessInfo()['successes']); + $this->assertEquals($input[7], + $entity->getPageProcessInfo()['successesThisRound']); + } + } + + public function maxRoundsProvider() + { + return [ + ['Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', -1] + ]; + } + + /** + * @dataProvider maxRoundsProvider + * @param $file + * @param $input 
+ */ + public function testMaxRounds($file, $input) + { + /** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($input, $entity->getMaxRounds()); + } + } + + public function repeatProvider() + { + return [ + ['Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', 0] + ]; + } + + /** + * @dataProvider repeatProvider + * @param $file + * @param $input + */ + public function testRepeat($file, $input) + { + /** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($input, $entity->getRepeat()); + } + } + + public function crawlDelayProvider() + { + return [ + ['Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', 0.25] + ]; + } + + /** + * @dataProvider crawlDelayProvider + * @param $file + * @param $input + */ + public function testDelay($file, $input) + { + /** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($input, $entity->getCrawlDelay()); + } + } + + public function obeyRobotsProvider() + { + return [ + ['Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', 1] + ]; + } + + /** + * @dataProvider obeyRobotsProvider + * @param $file + * @param $input + */ + public function testObeyRobots($file, $input) + { + /** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($input, $entity->getObeyRobots()); + } + } + + public function maxProvider() + { + return [ + ['Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', 100000, 100000] + ]; + } + + /** + * @dataProvider maxProvider + * @param $file + * @param $input1 + * @param $input2 + */ + public function testMax($file, $input1, $input2) + { + /** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($input1, $entity->getMaxToCrawl()); + $this->assertEquals($input2, $entity->getMaxToProcess()); + } + } + + public function processIfNewProvider() 
+ { + return [ + ['Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', 1] + ]; + } + + /** + * @dataProvider processIfNewProvider + * @param $file + * @param $input + */ + public function testProcessNew($file, $input) + { + /** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($input, $entity->getOnlyProcessIfNew()); + } + } + + public function seedsProvider() + { + return [ + [ + 'Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', + ['http://sitepoint.com'] + ] + ]; + } + + /** + * @dataProvider seedsProvider + * @param $file + * @param $input + */ + public function testSeeds($file, $input) + { + /** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($input, $entity->getSeeds()); + } + } + + public function roundsCompletedProvider() + { + return [ + ['Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', 0] + ]; + } + + /** + * @dataProvider roundsCompletedProvider + * @param $file + * @param $input + */ + public function testRoundsCompleted($file, $input) + { + /** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($input, $entity->getRoundsCompleted()); + } + } + + public function roundStartTimeProvider() + { + return [ + ['Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', 0] + ]; + } + + /** + * @dataProvider roundStartTimeProvider + * @param $file + * @param $input + */ + public function testRoundStartTime($file, $input) + { + /** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($input, $entity->getRoundStartTime()); + } + } + + public function apiUrlProvider() + { + return [ + [ + 'Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', + 'http://api.diffbot.com/v3/article?&discussion=false' + ] + ]; + } + + /** + * @dataProvider apiUrlProvider + * @param $file + * @param $input + */ + public function testApiUrl($file, $input) + { + 
/** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($input, $entity->getApiUrl()); + } + } + + public function patternProvider() + { + return [ + [ + 'Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', + [ + '', + '', + '' + ] + ] + ]; + } + + /** + * @dataProvider patternProvider + * @param $file + * @param $input + */ + public function testPatterns($file, $input) + { + /** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($input[0], $entity->getUrlCrawlPattern()); + $this->assertEquals($input[1], $entity->getUrlProcessPattern()); + $this->assertEquals($input[2], $entity->getPageProcessPattern()); + } + } + + public function regexProvider() + { + return [ + [ + 'Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', + [ + '', + '' + ] + ] + ]; + } + + /** + * @dataProvider regexProvider + * @param $file + * @param $input + */ + public function testRegex($file, $input) + { + /** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($input[0], $entity->getUrlCrawlRegex()); + $this->assertEquals($input[1], $entity->getUrlProcessRegex()); + } + } + + public function maxHopsProvider() + { + return [ + ['Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', -1] + ]; + } + + /** + * @dataProvider maxHopsProvider + * @param $file + * @param $input + */ + public function testMaxHops($file, $input) + { + /** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($input, $entity->getMaxHops()); + } + } + + public function downloadProvider() + { + return [ + [ + 'Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', + [ + 'json' => 'http://api.diffbot.com/v3/crawl/download/xxxxxxxxxxx-sitepoint_01_data.json', + 'csv' => 'http://api.diffbot.com/v3/crawl/download/xxxxxxxxxxx-sitepoint_01_data.csv', + 'debug' => 
'http://api.diffbot.com/v3/crawl/download/xxxxxxxxxxx-sitepoint_01_urls.csv' + ] + ] + ]; + } + + /** + * @dataProvider downloadProvider + * @param $file + * @param $input + */ + public function testDownload($file, $input) + { + /** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($input['json'], $entity->getDownloadUrl()); + $this->assertEquals($input['json'], + $entity->getDownloadUrl('json')); + $this->assertEquals($input['csv'], $entity->getDownloadUrl('csv')); + $this->assertEquals($input['debug'], + $entity->getDownloadUrl('debug')); + } + } + + public function notifyProvider() + { + return [ + ['Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', ['', '']] + ]; + } + + /** + * @dataProvider notifyProvider + * @param $file + * @param $input + */ + public function testNotify($file, $input) + { + /** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->assertEquals($input[0], $entity->getNotifyEmail()); + $this->assertEquals($input[1], $entity->getNotifyWebhook()); + } + } + + public function downloadFailProvider() + { + return [ + [ + 'Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', + 'wrongkey' + ] + ]; + } + + /** + * @dataProvider downloadFailProvider + * @param $file + * @param $input + * @throws \Swader\Diffbot\Exceptions\DiffbotException + */ + public function testDownloadFail($file, $input) + { + /** + * @var int $i + * @var Job $entity + */ + foreach ($this->ei($file) as $i => $entity) { + $this->setExpectedException('InvalidArgumentException'); + $entity->getDownloadUrl($input); + } + } + + public function jobCountProvider() + { + return [ + ['Crawlbot/15-05-18/sitepoint_01_maxCrawled.json', 1], + ['Crawlbot/15-05-20/multiplejobs01.json', 2] + ]; + } + + /** + * @dataProvider jobCountProvider + * @param $file + * @param $input + */ + public function testCount($file, $input) + { + $this->assertEquals($input, $this->ei($file)->count()); + } +} diff 
--git a/tests/Mocks/Crawlbot/15-05-18/sitepoint_01_deleted.json b/tests/Mocks/Crawlbot/15-05-18/sitepoint_01_deleted.json new file mode 100644 index 0000000..0a8e810 --- /dev/null +++ b/tests/Mocks/Crawlbot/15-05-18/sitepoint_01_deleted.json @@ -0,0 +1,11 @@ +HTTP/1.1 200 OK +Server: nginx/1.6.3 +Date: Tue, 19 May 2015 21:22:24 GMT +Content-Type: application/json +Content-Length: 40 +Connection: keep-alive +Access-Control-Allow-Origin: * +Last-Modified: Tue, 19 May 2015 21:23:03 GMT +Access-Control-Allow-Origin: * + +{"response":"Successfully deleted job."} \ No newline at end of file diff --git a/tests/Mocks/Crawlbot/15-05-18/sitepoint_01_maxCrawled.json b/tests/Mocks/Crawlbot/15-05-18/sitepoint_01_maxCrawled.json new file mode 100644 index 0000000..18ae4db --- /dev/null +++ b/tests/Mocks/Crawlbot/15-05-18/sitepoint_01_maxCrawled.json @@ -0,0 +1,58 @@ +HTTP/1.1 200 OK +Server: nginx/1.6.3 +Date: Mon, 18 May 2015 20:44:20 GMT +Content-Type: application/json +Content-Length: 1205 +Connection: keep-alive +Access-Control-Allow-Origin: * +Pragma: no-cache +Expires: -1 +Last-Modified: Mon, 18 May 2015 20:44:59 GMT +Access-Control-Allow-Origin: * + +{ +"response":"Successfully added urls for spidering.", + +"jobs":[ + +{"name":"sitepoint_01", +"type":"crawl", +"jobCreationTimeUTC":1431865254, +"jobCompletionTimeUTC":1431928375, +"jobStatus":{"status":2,"message":"Job has reached maxToCrawl limit."}, +"sentJobDoneNotification":1, +"objectsFound":91500, +"urlsHarvested":3219125, +"pageCrawlAttempts":107872, +"pageCrawlSuccesses":100000, +"pageCrawlSuccessesThisRound":100000, +"pageProcessAttempts":91957, +"pageProcessSuccesses":91500, +"pageProcessSuccessesThisRound":91500, +"maxRounds":-1, +"repeat":0.000000, +"crawlDelay":0.250000, +"obeyRobots":1, +"maxToCrawl":100000, +"maxToProcess":100000, +"onlyProcessIfNew":1, +"seeds":"http://sitepoint.com", +"roundsCompleted":0, +"roundStartTime":0, +"currentTime":1431981899, +"currentTimeUTC":1431981899, 
+"apiUrl":"http://api.diffbot.com/v3/article?&discussion=false", +"urlCrawlPattern":"", +"urlProcessPattern":"", +"pageProcessPattern":"", +"urlCrawlRegEx":"", +"urlProcessRegEx":"", +"maxHops":-1, +"downloadJson":"http://api.diffbot.com/v3/crawl/download/xxxxxxxxxxx-sitepoint_01_data.json", +"downloadUrls":"http://api.diffbot.com/v3/crawl/download/xxxxxxxxxxx-sitepoint_01_urls.csv", +"notifyEmail":"", +"notifyWebhook":"" +} + +] +} diff --git a/tests/Mocks/Crawlbot/15-05-20/deletedSuccess.json b/tests/Mocks/Crawlbot/15-05-20/deletedSuccess.json new file mode 100644 index 0000000..a234370 --- /dev/null +++ b/tests/Mocks/Crawlbot/15-05-20/deletedSuccess.json @@ -0,0 +1,11 @@ +HTTP/1.1 200 OK +Server: nginx/1.6.3 +Date: Wed, 20 May 2015 21:43:21 GMT +Content-Type: application/json +Content-Length: 40 +Connection: keep-alive +Access-Control-Allow-Origin: * +Last-Modified: Wed, 20 May 2015 21:44:01 GMT +Access-Control-Allow-Origin: * + +{"response":"Successfully deleted job."} \ No newline at end of file diff --git a/tests/Mocks/Crawlbot/15-05-20/invalid_name.json b/tests/Mocks/Crawlbot/15-05-20/invalid_name.json new file mode 100644 index 0000000..944cb94 --- /dev/null +++ b/tests/Mocks/Crawlbot/15-05-20/invalid_name.json @@ -0,0 +1,10 @@ +HTTP/1.1 500 +Server: nginx/1.6.3 +Date: Wed, 20 May 2015 22:43:55 GMT +Content-Type: application/json +Content-Length: 44 +Connection: keep-alive +Access-Control-Allow-Origin: * +Last-Modified: Wed, 20 May 2015 22:44:35 GMT + +{"error":"crawlbot: name is over 30 chars"} diff --git a/tests/Mocks/Crawlbot/15-05-20/invalid_response.json b/tests/Mocks/Crawlbot/15-05-20/invalid_response.json new file mode 100644 index 0000000..c4af886 --- /dev/null +++ b/tests/Mocks/Crawlbot/15-05-20/invalid_response.json @@ -0,0 +1,11 @@ +HTTP/1.1 200 OK +Server: nginx/1.6.3 +Date: Wed, 20 May 2015 21:43:21 GMT +Content-Type: application/json +Content-Length: 40 +Connection: keep-alive +Access-Control-Allow-Origin: * +Last-Modified: Wed, 20 May 2015 
21:44:01 GMT +Access-Control-Allow-Origin: * + +{"foo":"bar"} \ No newline at end of file diff --git a/tests/Mocks/Crawlbot/15-05-20/multiplejobs01.json b/tests/Mocks/Crawlbot/15-05-20/multiplejobs01.json new file mode 100644 index 0000000..328be18 --- /dev/null +++ b/tests/Mocks/Crawlbot/15-05-20/multiplejobs01.json @@ -0,0 +1,96 @@ +HTTP/1.1 200 OK +Server: nginx/1.6.3 +Date: Wed, 20 May 2015 21:04:33 GMT +Content-Type: application/json +Content-Length: 2242 +Connection: keep-alive +Pragma: no-cache +Expires: -1 +Last-Modified: Wed, 20 May 2015 21:05:13 GMT +Access-Control-Allow-Origin: * + +{ +"jobs":[ + +{"name":"sitepoint_0", +"type":"crawl", +"jobCreationTimeUTC":1432069878, +"jobCompletionTimeUTC":1432130762, +"jobStatus":{"status":2,"message":"Job has reached maxToCrawl limit."}, +"sentJobDoneNotification":1, +"objectsFound":92096, +"urlsHarvested":3386071, +"pageCrawlAttempts":107716, +"pageCrawlSuccesses":100003, +"pageCrawlSuccessesThisRound":100003, +"pageProcessAttempts":92441, +"pageProcessSuccesses":92096, +"pageProcessSuccessesThisRound":92096, +"maxRounds":-1, +"repeat":0.000000, +"crawlDelay":0.250000, +"obeyRobots":1, +"maxToCrawl":100000, +"maxToProcess":100000, +"onlyProcessIfNew":1, +"seeds":"http://sitepoint.com", +"roundsCompleted":0, +"roundStartTime":0, +"currentTime":1432155913, +"currentTimeUTC":1432155913, +"apiUrl":"http://api.diffbot.com/v3/article?&discussion=false", +"urlCrawlPattern":"", +"urlProcessPattern":"", +"pageProcessPattern":"", +"urlCrawlRegEx":"", +"urlProcessRegEx":"", +"maxHops":-1, +"downloadJson":"http://api.diffbot.com/v3/crawl/download/xxxxxxxxxxx-sitepoint_0_data.json", +"downloadUrls":"http://api.diffbot.com/v3/crawl/download/xxxxxxxxxxx-sitepoint_0_urls.csv", +"notifyEmail":"", +"notifyWebhook":"" +} +, + + +{"name":"sitepoint_01", +"type":"crawl", +"jobCreationTimeUTC":1432155773, +"jobCompletionTimeUTC":0, +"jobStatus":{"status":7,"message":"Job is in progress."}, +"sentJobDoneNotification":0, 
+"objectsFound":95, +"urlsHarvested":10282, +"pageCrawlAttempts":117, +"pageCrawlSuccesses":117, +"pageCrawlSuccessesThisRound":117, +"pageProcessAttempts":115, +"pageProcessSuccesses":95, +"pageProcessSuccessesThisRound":95, +"maxRounds":-1, +"repeat":0.000000, +"crawlDelay":0.250000, +"obeyRobots":1, +"maxToCrawl":100000, +"maxToProcess":100000, +"onlyProcessIfNew":1, +"seeds":"http://sitepoint.com", +"roundsCompleted":0, +"roundStartTime":0, +"currentTime":1432155913, +"currentTimeUTC":1432155913, +"apiUrl":"http://api.diffbot.com/v3/article?&discussion=false", +"urlCrawlPattern":"", +"urlProcessPattern":"", +"pageProcessPattern":"", +"urlCrawlRegEx":"", +"urlProcessRegEx":"", +"maxHops":-1, +"downloadJson":"http://api.diffbot.com/v3/crawl/download/xxxxxxxxxxx-sitepoint_01_data.json", +"downloadUrls":"http://api.diffbot.com/v3/crawl/download/xxxxxxxxxxx-sitepoint_01_urls.csv", +"notifyEmail":"", +"notifyWebhook":"" +} + +] +} diff --git a/tests/Mocks/Crawlbot/15-05-20/sitepoint_01_paused.json b/tests/Mocks/Crawlbot/15-05-20/sitepoint_01_paused.json new file mode 100644 index 0000000..14c45ca --- /dev/null +++ b/tests/Mocks/Crawlbot/15-05-20/sitepoint_01_paused.json @@ -0,0 +1,56 @@ +HTTP/1.1 200 OK +Server: nginx/1.6.3 +Date: Wed, 20 May 2015 22:11:49 GMT +Content-Type: application/json +Content-Length: 1091 +Connection: keep-alive +Access-Control-Allow-Origin: * +Pragma: no-cache +Expires: -1 +Last-Modified: Wed, 20 May 2015 22:12:29 GMT +Access-Control-Allow-Origin: * + +{ +"jobs":[ + +{"name":"sitepoint_01", +"type":"crawl", +"jobCreationTimeUTC":1432159882, +"jobCompletionTimeUTC":0, +"jobStatus":{"status":6,"message":"Job paused."}, +"sentJobDoneNotification":0, +"objectsFound":9, +"urlsHarvested":283, +"pageCrawlAttempts":10, +"pageCrawlSuccesses":10, +"pageCrawlSuccessesThisRound":10, +"pageProcessAttempts":10, +"pageProcessSuccesses":10, +"pageProcessSuccessesThisRound":10, +"maxRounds":-1, +"repeat":0.000000, +"crawlDelay":0.250000, +"obeyRobots":1, 
+"maxToCrawl":100000, +"maxToProcess":100000, +"onlyProcessIfNew":1, +"seeds":"http://sitepoint.com", +"roundsCompleted":0, +"roundStartTime":0, +"currentTime":1432159949, +"currentTimeUTC":1432159949, +"apiUrl":"http://api.diffbot.com/v3/article?&discussion=false", +"urlCrawlPattern":"", +"urlProcessPattern":"", +"pageProcessPattern":"", +"urlCrawlRegEx":"", +"urlProcessRegEx":"", +"maxHops":-1, +"downloadJson":"http://api.diffbot.com/v3/crawl/download/xxxxxxxxxxx-sitepoint_01_data.json", +"downloadUrls":"http://api.diffbot.com/v3/crawl/download/xxxxxxxxxxx-sitepoint_01_urls.csv", +"notifyEmail":"", +"notifyWebhook":"" +} + +] +} diff --git a/tests/Mocks/Crawlbot/15-05-20/sitepoint_01_restart.json b/tests/Mocks/Crawlbot/15-05-20/sitepoint_01_restart.json new file mode 100644 index 0000000..aa6a35b --- /dev/null +++ b/tests/Mocks/Crawlbot/15-05-20/sitepoint_01_restart.json @@ -0,0 +1,58 @@ +HTTP/1.1 200 OK +Server: nginx/1.6.3 +Date: Wed, 20 May 2015 22:13:03 GMT +Content-Type: application/json +Content-Length: 1146 +Connection: keep-alive +Access-Control-Allow-Origin: * +Pragma: no-cache +Expires: -1 +Last-Modified: Wed, 20 May 2015 22:13:43 GMT +Access-Control-Allow-Origin: * + +{ +"response":"Successfully added urls for spidering.", + +"jobs":[ + +{"name":"sitepoint_01", +"type":"crawl", +"jobCreationTimeUTC":1432160023, +"jobCompletionTimeUTC":0, +"jobStatus":{"status":0,"message":"Job is initializing."}, +"sentJobDoneNotification":0, +"objectsFound":0, +"urlsHarvested":0, +"pageCrawlAttempts":0, +"pageCrawlSuccesses":0, +"pageCrawlSuccessesThisRound":0, +"pageProcessAttempts":0, +"pageProcessSuccesses":0, +"pageProcessSuccessesThisRound":0, +"maxRounds":-1, +"repeat":0.000000, +"crawlDelay":0.250000, +"obeyRobots":1, +"maxToCrawl":100000, +"maxToProcess":100000, +"onlyProcessIfNew":1, +"seeds":"http://sitepoint.com", +"roundsCompleted":0, +"roundStartTime":0, +"currentTime":1432160023, +"currentTimeUTC":1432160023, 
+"apiUrl":"http://api.diffbot.com/v3/article?&discussion=false", +"urlCrawlPattern":"", +"urlProcessPattern":"", +"pageProcessPattern":"", +"urlCrawlRegEx":"", +"urlProcessRegEx":"", +"maxHops":-1, +"downloadJson":"http://api.diffbot.com/v3/crawl/download/xxxxxxxxxxx-sitepoint_01_data.json", +"downloadUrls":"http://api.diffbot.com/v3/crawl/download/xxxxxxxxxxx-sitepoint_01_urls.csv", +"notifyEmail":"", +"notifyWebhook":"" +} + +] +} diff --git a/tests/Mocks/Crawlbot/15-05-20/sitepoint_01_roundstart.json b/tests/Mocks/Crawlbot/15-05-20/sitepoint_01_roundstart.json new file mode 100644 index 0000000..8160e48 --- /dev/null +++ b/tests/Mocks/Crawlbot/15-05-20/sitepoint_01_roundstart.json @@ -0,0 +1,56 @@ +HTTP/1.1 200 OK +Server: nginx/1.6.3 +Date: Wed, 20 May 2015 22:12:40 GMT +Content-Type: application/json +Content-Length: 1113 +Connection: keep-alive +Access-Control-Allow-Origin: * +Pragma: no-cache +Expires: -1 +Last-Modified: Wed, 20 May 2015 22:13:20 GMT +Access-Control-Allow-Origin: * + +{ +"jobs":[ + +{"name":"sitepoint_01", +"type":"crawl", +"jobCreationTimeUTC":1432159882, +"jobCompletionTimeUTC":0, +"jobStatus":{"status":7,"message":"Job is in progress."}, +"sentJobDoneNotification":0, +"objectsFound":106, +"urlsHarvested":7420, +"pageCrawlAttempts":166, +"pageCrawlSuccesses":163, +"pageCrawlSuccessesThisRound":0, +"pageProcessAttempts":124, +"pageProcessSuccesses":110, +"pageProcessSuccessesThisRound":0, +"maxRounds":-1, +"repeat":0.000000, +"crawlDelay":0.250000, +"obeyRobots":1, +"maxToCrawl":100000, +"maxToProcess":100000, +"onlyProcessIfNew":1, +"seeds":"http://sitepoint.com", +"roundsCompleted":3, +"roundStartTime":1432160000, +"currentTime":1432160000, +"currentTimeUTC":1432160000, +"apiUrl":"http://api.diffbot.com/v3/article?&discussion=false", +"urlCrawlPattern":"", +"urlProcessPattern":"", +"pageProcessPattern":"", +"urlCrawlRegEx":"", +"urlProcessRegEx":"", +"maxHops":-1, 
+"downloadJson":"http://api.diffbot.com/v3/crawl/download/xxxxxxxxxxx-sitepoint_01_data.json", +"downloadUrls":"http://api.diffbot.com/v3/crawl/download/xxxxxxxxxxx-sitepoint_01_urls.csv", +"notifyEmail":"", +"notifyWebhook":"" +} + +] +} diff --git a/tests/Mocks/Crawlbot/15-05-20/sitepoint_01_unpaused.json b/tests/Mocks/Crawlbot/15-05-20/sitepoint_01_unpaused.json new file mode 100644 index 0000000..61e1e69 --- /dev/null +++ b/tests/Mocks/Crawlbot/15-05-20/sitepoint_01_unpaused.json @@ -0,0 +1,56 @@ +HTTP/1.1 200 OK +Server: nginx/1.6.3 +Date: Wed, 20 May 2015 22:11:57 GMT +Content-Type: application/json +Content-Length: 1099 +Connection: keep-alive +Access-Control-Allow-Origin: * +Pragma: no-cache +Expires: -1 +Last-Modified: Wed, 20 May 2015 22:12:37 GMT +Access-Control-Allow-Origin: * + +{ +"jobs":[ + +{"name":"sitepoint_01", +"type":"crawl", +"jobCreationTimeUTC":1432159882, +"jobCompletionTimeUTC":0, +"jobStatus":{"status":7,"message":"Job is in progress."}, +"sentJobDoneNotification":0, +"objectsFound":9, +"urlsHarvested":283, +"pageCrawlAttempts":10, +"pageCrawlSuccesses":10, +"pageCrawlSuccessesThisRound":10, +"pageProcessAttempts":10, +"pageProcessSuccesses":10, +"pageProcessSuccessesThisRound":10, +"maxRounds":-1, +"repeat":0.000000, +"crawlDelay":0.250000, +"obeyRobots":1, +"maxToCrawl":100000, +"maxToProcess":100000, +"onlyProcessIfNew":1, +"seeds":"http://sitepoint.com", +"roundsCompleted":0, +"roundStartTime":0, +"currentTime":1432159957, +"currentTimeUTC":1432159957, +"apiUrl":"http://api.diffbot.com/v3/article?&discussion=false", +"urlCrawlPattern":"", +"urlProcessPattern":"", +"pageProcessPattern":"", +"urlCrawlRegEx":"", +"urlProcessRegEx":"", +"maxHops":-1, +"downloadJson":"http://api.diffbot.com/v3/crawl/download/xxxxxxxxxxx-sitepoint_01_data.json", +"downloadUrls":"http://api.diffbot.com/v3/crawl/download/xxxxxxxxxxx-sitepoint_01_urls.csv", +"notifyEmail":"", +"notifyWebhook":"" +} + +] +}