diff --git a/src/calibre/gui2/store/__init__.py b/src/calibre/gui2/store/__init__.py index 0a7baf9807ee..946cbf5e40f6 100644 --- a/src/calibre/gui2/store/__init__.py +++ b/src/calibre/gui2/store/__init__.py @@ -2,9 +2,86 @@ __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' +from contextlib import closing +from time import perf_counter + +from lxml import html + +from calibre import browser as create_browser, prints +from calibre.constants import DEBUG +from calibre.scraper.simple import read_url from calibre.utils.filenames import ascii_filename +def browser_get_url(url, timeout, browser=None, user_agent=None, headers=None, data=None, novisit=False, html_parser=None, save_html_to=None): + """ + Retrieve the content at the given HTTP URL, + and measure the time it takes to do so in DEBUG mode. + Uses mechanize.Browser + + :param url: a URL string. + + :param timeout: a numerical timeout in seconds for the HTTP request. + + :param browser: an optional existing mechanize.Browser instance. + If not provided, a new one will be created. + + :param user_agent: optional User-Agent to use if no "browser" parameter is provided. + + :param headers: optional list of HTTP headers to set on the request + + :param data: optional query parameters + + :param novisit: optional boolean indicating to use mechanize "novisit" method + when fetching web pages. + + :param save_html_to: an optional file path where to save the web page content. + + :param html_parser: an optional function to parse the HTML string. 
+ By default: lxml.html.fromstring + + :return: a parsed HTML element/document + """ + start_time = perf_counter() + if browser is None: + browser = create_browser(user_agent=user_agent) + if headers: + browser.addheaders.extend(headers) + browser_open = browser.open_novisit if novisit else browser.open + with closing(browser_open(url, data=data, timeout=timeout)) as web_page: + html_content = web_page.read() + if save_html_to: + with open(save_html_to, 'wb') as html_file: + html_file.write(html_content) + if not html_parser: + html_parser = html.fromstring + html_parsed = html_parser(html_content) + if DEBUG: + duration = perf_counter() - start_time + prints(f'browser_get_url took {duration:.2f}s for URL {url}') + return html_parsed + + +def http_get_url(storage, url, timeout): + """ + Retrieve the content at the given HTTP URL, + and measure the time it takes to do so in DEBUG mode. + Uses qt.webengine and hence the chromium network stack. + + :param url: a URL string. + + :param timeout: a numerical timeout in seconds for the HTTP request. 
+ + :return: the HTML content as a string + """ + start_time = perf_counter() + html_content = read_url(storage, url, timeout) + if DEBUG: + duration = perf_counter() - start_time + prints(f"http_get_url took {duration:.2f}s for URL {url}") + return html_content + + class StorePlugin: # {{{ ''' diff --git a/src/calibre/gui2/store/amazon_base.py b/src/calibre/gui2/store/amazon_base.py index 281815efb2ba..037c547f3a9b 100644 --- a/src/calibre/gui2/store/amazon_base.py +++ b/src/calibre/gui2/store/amazon_base.py @@ -2,7 +2,6 @@ # vim:fileencoding=utf-8 # License: GPL v3 Copyright: 2022, Kovid Goyal -from qt.core import QUrl from threading import Lock from time import monotonic @@ -45,7 +44,7 @@ class AmazonStore: def open(self, parent=None, detail_item=None, external=False): store_link = get_method('get_store_link_amazon')(self, detail_item) - open_url(QUrl(store_link)) + open_url(store_link) def search(self, query, max_results=10, timeout=60): for result in get_method('search_amazon')(self, query, max_results=max_results, timeout=timeout): diff --git a/src/calibre/gui2/store/amazon_live.py b/src/calibre/gui2/store/amazon_live.py index 848e21eae93c..8b88db4a516e 100644 --- a/src/calibre/gui2/store/amazon_live.py +++ b/src/calibre/gui2/store/amazon_live.py @@ -6,7 +6,7 @@ from lxml import etree, html from urllib.parse import urlencode -from calibre.scraper.simple import read_url +from calibre.gui2.store import http_get_url from calibre.gui2.store.search_result import SearchResult @@ -26,7 +26,7 @@ def asbytes(x): url = self.SEARCH_BASE_URL + '?' 
+ urlencode(uquery) counter = max_results - raw = read_url(self.scraper_storage, url, timeout=timeout) + raw = http_get_url(self.scraper_storage, url, timeout=timeout) if write_html_to is not None: with open(write_html_to, 'w') as f: f.write(raw) @@ -85,7 +85,7 @@ def parse_details_amazon(self, idata, search_result): def get_details_amazon(self, search_result, timeout): url = self.DETAILS_URL + search_result.detail_item - raw = read_url(self.scraper_storage, url, timeout=timeout) + raw = http_get_url(self.scraper_storage, url, timeout=timeout) idata = html.fromstring(raw) return parse_details_amazon(self, idata, search_result) diff --git a/src/calibre/gui2/store/opensearch_store.py b/src/calibre/gui2/store/opensearch_store.py index 110ed81e0b31..06243be7901d 100644 --- a/src/calibre/gui2/store/opensearch_store.py +++ b/src/calibre/gui2/store/opensearch_store.py @@ -4,8 +4,6 @@ from contextlib import closing -from qt.core import QUrl - from calibre import (browser, guess_extension) from calibre.gui2 import open_url from calibre.utils.xml_parse import safe_xml_fromstring @@ -88,7 +86,7 @@ def open(self, parent=None, detail_item=None, external=False): return if external or self.config.get('open_external', False): - open_url(QUrl(detail_item if detail_item else self.web_url)) + open_url(detail_item if detail_item else self.web_url) else: d = WebStoreDialog(self.gui, self.web_url, parent, detail_item, create_browser=self.create_browser) d.setWindowTitle(self.name) diff --git a/src/calibre/gui2/store/stores/amazon_de_plugin.py b/src/calibre/gui2/store/stores/amazon_de_plugin.py index 21b22b2a8076..3e270a175040 100644 --- a/src/calibre/gui2/store/stores/amazon_de_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_de_plugin.py @@ -5,19 +5,13 @@ store_version = 15 # Needed for dynamic plugin loading -from contextlib import closing try: from urllib.parse import urlencode except ImportError: from urllib import urlencode -from lxml import html - -from qt.core import QUrl - 
-from calibre import browser from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.search_result import SearchResult SEARCH_BASE_URL = 'https://www.amazon.de/s/' @@ -49,102 +43,93 @@ def asbytes(x): return x uquery = {asbytes(k):asbytes(v) for k, v in uquery.items()} url = base_url + '?' + urlencode(uquery) - br = browser(user_agent=get_user_agent()) + + doc = browser_get_url(url, timeout, user_agent=get_user_agent(), save_html_to=write_html_to) + + try: + results = doc.xpath('//div[@id="atfResults" and @class]')[0] + except IndexError: + return + + if 's-result-list-parent-container' in results.get('class', ''): + data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]" + format_xpath = './/a[contains(text(), "%s")]//text()' % KINDLE_EDITION + asin_xpath = '@data-asin' + cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src" + title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()" + author_xpath = './/span[starts-with(text(), "%s ")]/following-sibling::span//text()' % BY + price_xpath = ('descendant::div[@class="a-row a-spacing-none" and' + ' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()') + else: + return counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - raw = f.read() - if write_html_to is not None: - with open(write_html_to, 'wb') as f: - f.write(raw) - doc = html.fromstring(raw) - try: - results = doc.xpath('//div[@id="atfResults" and @class]')[0] - except IndexError: - return - - if 's-result-list-parent-container' in results.get('class', ''): - data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]" - 
format_xpath = './/a[contains(text(), "%s")]//text()' % KINDLE_EDITION - asin_xpath = '@data-asin' - cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src" - title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()" - author_xpath = './/span[starts-with(text(), "%s ")]/following-sibling::span//text()' % BY - price_xpath = ('descendant::div[@class="a-row a-spacing-none" and' - ' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()') + for data in doc.xpath(data_xpath): + if counter <= 0: + break + + # Even though we are searching digital-text only Amazon will still + # put in results for non Kindle books (author pages). Se we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + format = ''.join(data.xpath(format_xpath)) + if 'kindle' not in format.lower(): + continue + + # We must have an asin otherwise we can't easily reference the + # book later. + asin = data.xpath(asin_xpath) + if asin: + asin = asin[0] else: - return - - for data in doc.xpath(data_xpath): - if counter <= 0: - break - - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (author pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format.lower(): - continue - - # We must have an asin otherwise we can't easily reference the - # book later. 
- asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue + continue - cover_url = ''.join(data.xpath(cover_xpath)) + cover_url = ''.join(data.xpath(cover_xpath)) - title = ''.join(data.xpath(title_xpath)) - author = ''.join(data.xpath(author_xpath)) - try: - author = author.split('by ', 1)[1].split(" (")[0] - except: - pass + title = ''.join(data.xpath(title_xpath)) + author = ''.join(data.xpath(author_xpath)) + try: + author = author.split('by ', 1)[1].split(" (")[0] + except: + pass - price = ''.join(data.xpath(price_xpath)) + price = ''.join(data.xpath(price_xpath)) - counter -= 1 + counter -= 1 - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = author.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.formats = 'Kindle' + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = author.strip() + s.price = price.strip() + s.detail_item = asin.strip() + s.formats = 'Kindle' - yield s + yield s class AmazonKindleStore(StorePlugin): def open(self, parent=None, detail_item=None, external=False): store_link = (DETAILS_URL + detail_item) if detail_item else STORE_LINK - open_url(QUrl(store_link)) + open_url(store_link) def search(self, query, max_results=10, timeout=60): for result in search_amazon(query, max_results=max_results, timeout=timeout): yield result def get_details(self, search_result, timeout): - url = DETAILS_URL - - br = browser(user_agent=get_user_agent()) - with closing(br.open(url + search_result.detail_item, timeout=timeout)) as nf: - idata = html.fromstring(nf.read()) - if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' + + idata = browser_get_url(DETAILS_URL + search_result.detail_item, timeout, user_agent=get_user_agent()) + if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' + + DRM_SEARCH_TEXT + '")])'): + if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' + + 
DRM_FREE_TEXT + '") and contains(b, "' + DRM_SEARCH_TEXT + '")])'): - if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' + - DRM_FREE_TEXT + '") and contains(b, "' + - DRM_SEARCH_TEXT + '")])'): - search_result.drm = SearchResult.DRM_UNLOCKED - else: - search_result.drm = SearchResult.DRM_UNKNOWN + search_result.drm = SearchResult.DRM_UNLOCKED else: - search_result.drm = SearchResult.DRM_LOCKED + search_result.drm = SearchResult.DRM_UNKNOWN + else: + search_result.drm = SearchResult.DRM_LOCKED return True diff --git a/src/calibre/gui2/store/stores/amazon_es_plugin.py b/src/calibre/gui2/store/stores/amazon_es_plugin.py index d9fca8e33cea..29f18d3e0e78 100644 --- a/src/calibre/gui2/store/stores/amazon_es_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_es_plugin.py @@ -5,19 +5,13 @@ store_version = 15 # Needed for dynamic plugin loading -from contextlib import closing try: from urllib.parse import urlencode except ImportError: from urllib import urlencode -from lxml import html - -from qt.core import QUrl - -from calibre import browser from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.search_result import SearchResult SEARCH_BASE_URL = 'https://www.amazon.es/s/' @@ -49,102 +43,93 @@ def asbytes(x): return x uquery = {asbytes(k):asbytes(v) for k, v in uquery.items()} url = base_url + '?' 
+ urlencode(uquery) - br = browser(user_agent=get_user_agent()) + + doc = browser_get_url(url, timeout, user_agent=get_user_agent(), save_html_to=write_html_to) + + try: + results = doc.xpath('//div[@id="atfResults" and @class]')[0] + except IndexError: + return + + if 's-result-list-parent-container' in results.get('class', ''): + data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]" + format_xpath = './/a[@title="%s"]/@title' % KINDLE_EDITION + asin_xpath = '@data-asin' + cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src" + title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()" + author_xpath = './/span[starts-with(text(), "%s ")]/following-sibling::span//text()' % BY + price_xpath = ('descendant::div[@class="a-row a-spacing-none" and' + ' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()') + else: + return counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - raw = f.read() - if write_html_to is not None: - with open(write_html_to, 'wb') as f: - f.write(raw) - doc = html.fromstring(raw) - try: - results = doc.xpath('//div[@id="atfResults" and @class]')[0] - except IndexError: - return - - if 's-result-list-parent-container' in results.get('class', ''): - data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]" - format_xpath = './/a[@title="%s"]/@title' % KINDLE_EDITION - asin_xpath = '@data-asin' - cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src" - title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()" - author_xpath = './/span[starts-with(text(), "%s 
")]/following-sibling::span//text()' % BY - price_xpath = ('descendant::div[@class="a-row a-spacing-none" and' - ' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()') + for data in doc.xpath(data_xpath): + if counter <= 0: + break + + # Even though we are searching digital-text only Amazon will still + # put in results for non Kindle books (author pages). Se we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + format = ''.join(data.xpath(format_xpath)) + if 'kindle' not in format.lower(): + continue + + # We must have an asin otherwise we can't easily reference the + # book later. + asin = data.xpath(asin_xpath) + if asin: + asin = asin[0] else: - return - - for data in doc.xpath(data_xpath): - if counter <= 0: - break - - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (author pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format.lower(): - continue - - # We must have an asin otherwise we can't easily reference the - # book later. 
- asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue + continue - cover_url = ''.join(data.xpath(cover_xpath)) + cover_url = ''.join(data.xpath(cover_xpath)) - title = ''.join(data.xpath(title_xpath)) - author = ''.join(data.xpath(author_xpath)) - try: - author = author.split('by ', 1)[1].split(" (")[0] - except: - pass + title = ''.join(data.xpath(title_xpath)) + author = ''.join(data.xpath(author_xpath)) + try: + author = author.split('by ', 1)[1].split(" (")[0] + except: + pass - price = ''.join(data.xpath(price_xpath)) + price = ''.join(data.xpath(price_xpath)) - counter -= 1 + counter -= 1 - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = author.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.formats = 'Kindle' + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = author.strip() + s.price = price.strip() + s.detail_item = asin.strip() + s.formats = 'Kindle' - yield s + yield s class AmazonKindleStore(StorePlugin): def open(self, parent=None, detail_item=None, external=False): store_link = (DETAILS_URL + detail_item) if detail_item else STORE_LINK - open_url(QUrl(store_link)) + open_url(store_link) def search(self, query, max_results=10, timeout=60): for result in search_amazon(query, max_results=max_results, timeout=timeout): yield result def get_details(self, search_result, timeout): - url = DETAILS_URL - - br = browser(user_agent=get_user_agent()) - with closing(br.open(url + search_result.detail_item, timeout=timeout)) as nf: - idata = html.fromstring(nf.read()) - if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' + + idata = browser_get_url(DETAILS_URL + search_result.detail_item, timeout, user_agent=get_user_agent()) + if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' + + DRM_SEARCH_TEXT + '")])'): + if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' + + 
DRM_FREE_TEXT + '") and contains(b, "' + DRM_SEARCH_TEXT + '")])'): - if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' + - DRM_FREE_TEXT + '") and contains(b, "' + - DRM_SEARCH_TEXT + '")])'): - search_result.drm = SearchResult.DRM_UNLOCKED - else: - search_result.drm = SearchResult.DRM_UNKNOWN + search_result.drm = SearchResult.DRM_UNLOCKED else: - search_result.drm = SearchResult.DRM_LOCKED + search_result.drm = SearchResult.DRM_UNKNOWN + else: + search_result.drm = SearchResult.DRM_LOCKED return True diff --git a/src/calibre/gui2/store/stores/amazon_fr_plugin.py b/src/calibre/gui2/store/stores/amazon_fr_plugin.py index f61624fbd998..02b7c5f5f9d8 100644 --- a/src/calibre/gui2/store/stores/amazon_fr_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_fr_plugin.py @@ -5,19 +5,15 @@ store_version = 16 # Needed for dynamic plugin loading -from contextlib import closing try: from urllib.parse import urlencode except ImportError: from urllib import urlencode -from lxml import html, etree +from lxml import etree -from qt.core import QUrl - -from calibre import browser from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.search_result import SearchResult SEARCH_BASE_URL = 'https://www.amazon.fr/s/' @@ -49,79 +45,72 @@ def asbytes(x): return x uquery = {asbytes(k):asbytes(v) for k, v in uquery.items()} url = base_url + '?' 
+ urlencode(uquery) - br = browser(user_agent=get_user_agent()) + doc = browser_get_url(url, timeout, user_agent=get_user_agent(), save_html_to=write_html_to) counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - raw = f.read() - if write_html_to is not None: - with open(write_html_to, 'wb') as f: - f.write(raw) - doc = html.fromstring(raw) - for result in doc.xpath('//div[contains(@class, "s-result-list")]//div[@data-index and @data-asin]'): - kformat = ''.join(result.xpath('.//a[contains(text(), "{}")]//text()'.format(KINDLE_EDITION))) - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (author pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - if 'kindle' not in kformat.lower(): - continue - asin = result.get('data-asin') - if not asin: - continue - - cover_url = ''.join(result.xpath('.//img/@src')) - title = etree.tostring(result.xpath('.//h2')[0], method='text', encoding='unicode') - adiv = result.xpath('.//div[contains(@class, "a-color-secondary")]')[0] - aparts = etree.tostring(adiv, method='text', encoding='unicode').split() - idx = aparts.index(BY) - author = ' '.join(aparts[idx+1:]).split('|')[0].strip() - price = '' - for span in result.xpath('.//span[contains(@class, "a-price")]/span[contains(@class, "a-offscreen")]'): - q = ''.join(span.xpath('./text()')) - if q: - price = q - break - - counter -= 1 - - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = author.strip() - s.detail_item = asin.strip() - s.price = price.strip() - s.formats = 'Kindle' - - yield s + for result in doc.xpath('//div[contains(@class, "s-result-list")]//div[@data-index and @data-asin]'): + if counter <= 0: + break + + kformat = ''.join(result.xpath('.//a[contains(text(), "{}")]//text()'.format(KINDLE_EDITION))) + # Even though we are searching digital-text only Amazon will still + # put in results for non 
Kindle books (author pages). Se we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + if 'kindle' not in kformat.lower(): + continue + asin = result.get('data-asin') + if not asin: + continue + + cover_url = ''.join(result.xpath('.//img/@src')) + title = etree.tostring(result.xpath('.//h2')[0], method='text', encoding='unicode') + adiv = result.xpath('.//div[contains(@class, "a-color-secondary")]')[0] + aparts = etree.tostring(adiv, method='text', encoding='unicode').split() + idx = aparts.index(BY) + author = ' '.join(aparts[idx+1:]).split('|')[0].strip() + price = '' + for span in result.xpath('.//span[contains(@class, "a-price")]/span[contains(@class, "a-offscreen")]'): + q = ''.join(span.xpath('./text()')) + if q: + price = q + break + + counter -= 1 + + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = author.strip() + s.detail_item = asin.strip() + s.price = price.strip() + s.formats = 'Kindle' + + yield s class AmazonKindleStore(StorePlugin): def open(self, parent=None, detail_item=None, external=False): store_link = (DETAILS_URL + detail_item) if detail_item else STORE_LINK - open_url(QUrl(store_link)) + open_url(store_link) def search(self, query, max_results=10, timeout=60): for result in search_amazon(query, max_results=max_results, timeout=timeout): yield result def get_details(self, search_result, timeout): - url = DETAILS_URL - - br = browser(user_agent=get_user_agent()) - with closing(br.open(url + search_result.detail_item, timeout=timeout)) as nf: - idata = html.fromstring(nf.read()) - if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' + + idata = browser_get_url(DETAILS_URL + search_result.detail_item, timeout, user_agent=get_user_agent()) + if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' + + DRM_SEARCH_TEXT + '")])'): + if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' + + DRM_FREE_TEXT + '") and 
contains(b, "' + DRM_SEARCH_TEXT + '")])'): - if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' + - DRM_FREE_TEXT + '") and contains(b, "' + - DRM_SEARCH_TEXT + '")])'): - search_result.drm = SearchResult.DRM_UNLOCKED - else: - search_result.drm = SearchResult.DRM_UNKNOWN + search_result.drm = SearchResult.DRM_UNLOCKED else: - search_result.drm = SearchResult.DRM_LOCKED + search_result.drm = SearchResult.DRM_UNKNOWN + else: + search_result.drm = SearchResult.DRM_LOCKED return True diff --git a/src/calibre/gui2/store/stores/amazon_it_plugin.py b/src/calibre/gui2/store/stores/amazon_it_plugin.py index 58c72083fac9..4150119e2801 100644 --- a/src/calibre/gui2/store/stores/amazon_it_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_it_plugin.py @@ -5,19 +5,13 @@ store_version = 15 # Needed for dynamic plugin loading -from contextlib import closing try: from urllib.parse import urlencode except ImportError: from urllib import urlencode -from lxml import html - -from qt.core import QUrl - -from calibre import browser from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.search_result import SearchResult SEARCH_BASE_URL = 'https://www.amazon.it/s/' @@ -49,102 +43,93 @@ def asbytes(x): return x uquery = {asbytes(k):asbytes(v) for k, v in uquery.items()} url = base_url + '?' 
+ urlencode(uquery) - br = browser(user_agent=get_user_agent()) + + doc = browser_get_url(url, timeout, user_agent=get_user_agent(), save_html_to=write_html_to) + + try: + results = doc.xpath('//div[@id="atfResults" and @class]')[0] + except IndexError: + return + + if 's-result-list-parent-container' in results.get('class', ''): + data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]" + format_xpath = './/a[@title="%s"]/@title' % KINDLE_EDITION + asin_xpath = '@data-asin' + cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src" + title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()" + author_xpath = './/span[starts-with(text(), "%s ")]/following-sibling::span//text()' % BY + price_xpath = ('descendant::div[@class="a-row a-spacing-none" and' + ' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()') + else: + return counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - raw = f.read() - if write_html_to is not None: - with open(write_html_to, 'wb') as f: - f.write(raw) - doc = html.fromstring(raw) - try: - results = doc.xpath('//div[@id="atfResults" and @class]')[0] - except IndexError: - return - - if 's-result-list-parent-container' in results.get('class', ''): - data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]" - format_xpath = './/a[@title="%s"]/@title' % KINDLE_EDITION - asin_xpath = '@data-asin' - cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src" - title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()" - author_xpath = './/span[starts-with(text(), "%s 
")]/following-sibling::span//text()' % BY - price_xpath = ('descendant::div[@class="a-row a-spacing-none" and' - ' not(span[contains(@class, "kindle-unlimited")])]//span[contains(@class, "s-price")]//text()') + for data in doc.xpath(data_xpath): + if counter <= 0: + break + + # Even though we are searching digital-text only Amazon will still + # put in results for non Kindle books (author pages). Se we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + format = ''.join(data.xpath(format_xpath)) + if 'kindle' not in format.lower(): + continue + + # We must have an asin otherwise we can't easily reference the + # book later. + asin = data.xpath(asin_xpath) + if asin: + asin = asin[0] else: - return - - for data in doc.xpath(data_xpath): - if counter <= 0: - break - - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (author pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format.lower(): - continue - - # We must have an asin otherwise we can't easily reference the - # book later. 
- asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue + continue - cover_url = ''.join(data.xpath(cover_xpath)) + cover_url = ''.join(data.xpath(cover_xpath)) - title = ''.join(data.xpath(title_xpath)) - author = ''.join(data.xpath(author_xpath)) - try: - author = author.split('by ', 1)[1].split(" (")[0] - except: - pass + title = ''.join(data.xpath(title_xpath)) + author = ''.join(data.xpath(author_xpath)) + try: + author = author.split('by ', 1)[1].split(" (")[0] + except: + pass - price = ''.join(data.xpath(price_xpath)) + price = ''.join(data.xpath(price_xpath)) - counter -= 1 + counter -= 1 - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = author.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.formats = 'Kindle' + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = author.strip() + s.price = price.strip() + s.detail_item = asin.strip() + s.formats = 'Kindle' - yield s + yield s class AmazonKindleStore(StorePlugin): def open(self, parent=None, detail_item=None, external=False): store_link = (DETAILS_URL + detail_item) if detail_item else STORE_LINK - open_url(QUrl(store_link)) + open_url(store_link) def search(self, query, max_results=10, timeout=60): for result in search_amazon(query, max_results=max_results, timeout=timeout): yield result def get_details(self, search_result, timeout): - url = DETAILS_URL - - br = browser(user_agent=get_user_agent()) - with closing(br.open(url + search_result.detail_item, timeout=timeout)) as nf: - idata = html.fromstring(nf.read()) - if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' + + idata = browser_get_url(DETAILS_URL + search_result.detail_item, timeout, user_agent=get_user_agent()) + if idata.xpath('boolean(//div[@class="content"]//li/b[contains(text(), "' + + DRM_SEARCH_TEXT + '")])'): + if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' + + 
DRM_FREE_TEXT + '") and contains(b, "' + DRM_SEARCH_TEXT + '")])'): - if idata.xpath('boolean(//div[@class="content"]//li[contains(., "' + - DRM_FREE_TEXT + '") and contains(b, "' + - DRM_SEARCH_TEXT + '")])'): - search_result.drm = SearchResult.DRM_UNLOCKED - else: - search_result.drm = SearchResult.DRM_UNKNOWN + search_result.drm = SearchResult.DRM_UNLOCKED else: - search_result.drm = SearchResult.DRM_LOCKED + search_result.drm = SearchResult.DRM_UNKNOWN + else: + search_result.drm = SearchResult.DRM_LOCKED return True diff --git a/src/calibre/gui2/store/stores/baen_webscription_plugin.py b/src/calibre/gui2/store/stores/baen_webscription_plugin.py index d56b7243e9a5..5ce2fa95e19b 100644 --- a/src/calibre/gui2/store/stores/baen_webscription_plugin.py +++ b/src/calibre/gui2/store/stores/baen_webscription_plugin.py @@ -7,20 +7,14 @@ __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -from contextlib import closing try: from urllib.parse import urlencode except ImportError: from urllib import urlencode -from lxml import html - -from qt.core import QUrl - -from calibre import browser from calibre.ebooks.metadata import authors_to_string from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog @@ -30,49 +24,46 @@ def search(query, max_results=10, timeout=60): url = 'http://www.baen.com/catalogsearch/result/?' 
+ urlencode( {'q':query.lower(), 'dir':'desc', 'order':'relevance'}) - br = browser() + root = browser_get_url(url, timeout, novisit=True) counter = max_results - with closing(br.open_novisit(url, timeout=timeout)) as f: - raw = f.read() - root = html.fromstring(raw) - for data in root.xpath('//div[@id="productMatches"]//table[@id="authorTable"]//tr[contains(@class, "IDCell")]'): - if counter <= 0: - break - - try: - book_url = data.xpath('./td[1]/a/@href[1]')[0] - except IndexError: - continue - - try: - title = data.xpath('./td[2]/a[1]/text()')[0].strip() - except IndexError: - continue - try: - cover_url = data.xpath('./td[1]//img[1]/@src')[0] - except IndexError: - cover_url = '' - - tails = [(b.tail or '').strip() for b in data.xpath('./td[2]/br')] - authors = [x[2:].strip() for x in tails if x.startswith('by ')] - author = authors_to_string(authors) - price = ''.join(data.xpath('.//span[@class="variantprice"]/text()')) - a, b, price = price.partition('$') - price = b + price - - counter -= 1 - - s = SearchResult() - s.cover_url = cover_url - s.title = title.strip() - s.author = author.strip() - s.price = price - s.detail_item = book_url.strip() - s.drm = SearchResult.DRM_UNLOCKED - s.formats = 'RB, MOBI, EPUB, LIT, LRF, RTF, HTML' - - yield s + for data in root.xpath('//div[@id="productMatches"]//table[@id="authorTable"]//tr[contains(@class, "IDCell")]'): + if counter <= 0: + break + + try: + book_url = data.xpath('./td[1]/a/@href[1]')[0] + except IndexError: + continue + + try: + title = data.xpath('./td[2]/a[1]/text()')[0].strip() + except IndexError: + continue + try: + cover_url = data.xpath('./td[1]//img[1]/@src')[0] + except IndexError: + cover_url = '' + + tails = [(b.tail or '').strip() for b in data.xpath('./td[2]/br')] + authors = [x[2:].strip() for x in tails if x.startswith('by ')] + author = authors_to_string(authors) + price = ''.join(data.xpath('.//span[@class="variantprice"]/text()')) + a, b, price = price.partition('$') + price = b + price + 
+ counter -= 1 + + s = SearchResult() + s.cover_url = cover_url + s.title = title.strip() + s.author = author.strip() + s.price = price + s.detail_item = book_url.strip() + s.drm = SearchResult.DRM_UNLOCKED + s.formats = 'RB, MOBI, EPUB, LIT, LRF, RTF, HTML' + + yield s class BaenWebScriptionStore(BasicStoreConfig, StorePlugin): @@ -80,7 +71,7 @@ class BaenWebScriptionStore(BasicStoreConfig, StorePlugin): def open(self, parent=None, detail_item=None, external=False): url = 'http://www.baenebooks.com/' if external or self.config.get('open_external', False): - open_url(QUrl(detail_item or url)) + open_url(detail_item or url) else: d = WebStoreDialog(self.gui, url, parent, detail_item or url) d.setWindowTitle(self.name) diff --git a/src/calibre/gui2/store/stores/beam_ebooks_de_plugin.py b/src/calibre/gui2/store/stores/beam_ebooks_de_plugin.py index 1875e890feeb..4a7d47a42baa 100644 --- a/src/calibre/gui2/store/stores/beam_ebooks_de_plugin.py +++ b/src/calibre/gui2/store/stores/beam_ebooks_de_plugin.py @@ -11,15 +11,9 @@ from urllib.parse import quote except ImportError: from urllib2 import quote -from contextlib import closing -from lxml import html - -from qt.core import QUrl - -from calibre import browser from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog @@ -33,7 +27,7 @@ def open(self, parent=None, detail_item=None, external=False): if external or self.config.get('open_external', False): if detail_item: url = detail_item - open_url(QUrl(url)) + open_url(url) else: detail_url = None if detail_item: @@ -45,32 +39,30 @@ def open(self, parent=None, detail_item=None, external=False): def search(self, query, max_results=10, timeout=60): url = 
'https://www.beam-shop.de/search?saltFieldLimitation=all&sSearch=' + quote(query) - br = browser() + doc = browser_get_url(url, timeout) counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in doc.xpath('//div[contains(@class, "product--box")]'): - if counter <= 0: - break + for data in doc.xpath('//div[contains(@class, "product--box")]'): + if counter <= 0: + break - id_ = ''.join(data.xpath('./div/div[contains(@class, "product--info")]/a/@href')).strip() - if not id_: - continue - cover_url = ''.join(data.xpath('./div/div[contains(@class, "product--info")]/a//img/@srcset')) - if cover_url: - cover_url = cover_url.split(',')[0].strip() - author = data.xpath('.//a[@class="product--author"]/text()')[0].strip() - title = data.xpath('.//a[@class="product--title"]/text()')[0].strip() - price = data.xpath('.//div[@class="product--price"]/span/text()')[0].strip() - counter -= 1 + id_ = ''.join(data.xpath('./div/div[contains(@class, "product--info")]/a/@href')).strip() + if not id_: + continue + cover_url = ''.join(data.xpath('./div/div[contains(@class, "product--info")]/a//img/@srcset')) + if cover_url: + cover_url = cover_url.split(',')[0].strip() + author = data.xpath('.//a[@class="product--author"]/text()')[0].strip() + title = data.xpath('.//a[@class="product--title"]/text()')[0].strip() + price = data.xpath('.//div[@class="product--price"]/span/text()')[0].strip() + counter -= 1 - s = SearchResult() - s.cover_url = cover_url - s.title = title.strip() - s.author = author.strip() - s.price = price - s.drm = SearchResult.DRM_UNLOCKED - s.detail_item = id_ -# s.formats = None - yield s + s = SearchResult() + s.cover_url = cover_url + s.title = title.strip() + s.author = author.strip() + s.price = price + s.drm = SearchResult.DRM_UNLOCKED + s.detail_item = id_ +# s.formats = None + yield s diff --git a/src/calibre/gui2/store/stores/biblio_plugin.py b/src/calibre/gui2/store/stores/biblio_plugin.py index 
63e709685726..b5959ca0fdc5 100644 --- a/src/calibre/gui2/store/stores/biblio_plugin.py +++ b/src/calibre/gui2/store/stores/biblio_plugin.py @@ -12,14 +12,11 @@ except ImportError: from urllib import quote_plus -from calibre import browser from calibre.gui2 import open_url +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig -from calibre.gui2.store import StorePlugin from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog -from contextlib import closing -from lxml import html class BiblioStore(BasicStoreConfig, StorePlugin): @@ -47,54 +44,49 @@ def search(self, query, max_results=10, timeout=60): yield from self._do_search(url, max_results, timeout) def get_details(self, search_result, timeout): - br = browser() - with closing(br.open(search_result.detail_item, timeout=timeout)) as nf: - idata = html.fromstring(nf.read()) - search_result.formats = '' - search_result.drm = SearchResult.DRM_LOCKED - - for option in idata.xpath('//ul[@class="order_product_options"]/li'): - option_type = option.text.strip() if option.text else '' - if option_type.startswith('Формат:'): - search_result.formats = ''.join(option.xpath('.//b/text()')).strip() - if option_type.startswith('Защита:'): - if ''.join(option.xpath('.//b/text()')).strip() == 'няма': - search_result.drm = SearchResult.DRM_UNLOCKED - - if not search_result.author: - search_result.author = ', '.join(idata.xpath('//div[@class="row product_info"]/div/div/div[@class="item-author"]/a/text()')).strip(', ') + idata = browser_get_url(search_result.detail_item, timeout) + search_result.formats = '' + search_result.drm = SearchResult.DRM_LOCKED + + for option in idata.xpath('//ul[@class="order_product_options"]/li'): + option_type = option.text.strip() if option.text else '' + if option_type.startswith('Формат:'): + search_result.formats = ''.join(option.xpath('.//b/text()')).strip() + if 
option_type.startswith('Защита:'): + if ''.join(option.xpath('.//b/text()')).strip() == 'няма': + search_result.drm = SearchResult.DRM_UNLOCKED + + if not search_result.author: + search_result.author = ', '.join(idata.xpath('//div[@class="row product_info"]/div/div/div[@class="item-author"]/a/text()')).strip(', ') return True def _do_search(self, url, max_results, timeout): - br = browser() - with closing(br.open(url, timeout=timeout)) as f: - page = f.read().decode('utf-8') - doc = html.fromstring(page) + doc = browser_get_url(url, timeout) - for data in doc.xpath('//ul[contains(@class,"book_list")]/li'): - if max_results <= 0: - break + for data in doc.xpath('//ul[contains(@class,"book_list")]/li'): + if max_results <= 0: + break - s = SearchResult() - s.detail_item = ''.join(data.xpath('.//a[@class="th"]/@href')).strip() - if not id: - continue + s = SearchResult() + s.detail_item = ''.join(data.xpath('.//a[@class="th"]/@href')).strip() + if not id: + continue - s.cover_url = ''.join(data.xpath('.//a[@class="th"]/img/@data-original')).strip() - s.title = ''.join(data.xpath('.//div[@class="item-title"]/a/text()')).strip() - s.author = ', '.join(data.xpath('.//div[@class="item-author"]/a/text()')).strip(', ') + s.cover_url = ''.join(data.xpath('.//a[@class="th"]/img/@data-original')).strip() + s.title = ''.join(data.xpath('.//div[@class="item-title"]/a/text()')).strip() + s.author = ', '.join(data.xpath('.//div[@class="item-author"]/a/text()')).strip(', ') - price_list = data.xpath('.//div[@class="item-price"]') - for price_item in price_list: - if price_item.text.startswith('е-книга:'): - s.price = ''.join(price_item.xpath('.//span/text()')) - break + price_list = data.xpath('.//div[@class="item-price"]') + for price_item in price_list: + if price_item.text.startswith('е-книга:'): + s.price = ''.join(price_item.xpath('.//span/text()')) + break - s.price = '0.00 лв.' 
if not s.price and not price_list else s.price - if not s.price: - # no e-book available - continue + s.price = '0.00 лв.' if not s.price and not price_list else s.price + if not s.price: + # no e-book available + continue - max_results -= 1 - yield s + max_results -= 1 + yield s diff --git a/src/calibre/gui2/store/stores/bn_plugin.py b/src/calibre/gui2/store/stores/bn_plugin.py index a8f0a4e5f311..8cd5d9c609b5 100644 --- a/src/calibre/gui2/store/stores/bn_plugin.py +++ b/src/calibre/gui2/store/stores/bn_plugin.py @@ -8,19 +8,14 @@ __docformat__ = 'restructuredtext en' import re -from contextlib import closing try: from urllib.parse import quote_plus except ImportError: from urllib import quote_plus -from lxml import html - -from qt.core import QUrl - -from calibre import browser, url_slash_cleaner +from calibre import url_slash_cleaner from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog @@ -32,7 +27,7 @@ def open(self, parent=None, detail_item=None, external=False): url = "http://bn.com" if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_item if detail_item else url))) + open_url(url_slash_cleaner(detail_item if detail_item else url)) else: d = WebStoreDialog(self.gui, url, parent, detail_item) d.setWindowTitle(self.name) @@ -42,42 +37,38 @@ def open(self, parent=None, detail_item=None, external=False): def search(self, query, max_results=10, timeout=60): url = 'http://www.barnesandnoble.com/s/%s?keyword=%s&store=ebook&view=list' % (query.decode('utf-8').replace(' ', '-'), quote_plus(query)) - br = browser() - + doc = browser_get_url(url, timeout) counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - raw = f.read() - doc = 
html.fromstring(raw) - for data in doc.xpath('//ol[contains(@class, "result-set")]/li[contains(@class, "result")]'): - if counter <= 0: - break - - id = ''.join(data.xpath('.//div[contains(@class, "image-block")]/a/@href')) - if not id: - continue - - cover_url = '' - cover_id = ''.join(data.xpath('.//img[contains(@class, "product-image")]/@id')) - m = re.search(r"%s'.*?srcUrl: '(?P.*?)'.*?}" % cover_id, raw) - if m: - cover_url = m.group('iurl') - - title = ''.join(data.xpath('descendant::p[@class="title"]//span[@class="name"]//text()')).strip() - if not title: - continue - - author = ', '.join(data.xpath('.//ul[contains(@class, "contributors")]//a[contains(@class, "subtle")]//text()')).strip() - price = ''.join(data.xpath('.//a[contains(@class, "bn-price")]//text()')) - - counter -= 1 - - s = SearchResult() - s.cover_url = cover_url - s.title = title.strip() - s.author = author.strip() - s.price = price.strip() - s.detail_item = id.strip() - s.drm = SearchResult.DRM_UNKNOWN - s.formats = 'Nook' - - yield s + for data in doc.xpath('//ol[contains(@class, "result-set")]/li[contains(@class, "result")]'): + if counter <= 0: + break + + id = ''.join(data.xpath('.//div[contains(@class, "image-block")]/a/@href')) + if not id: + continue + + cover_url = '' + cover_id = ''.join(data.xpath('.//img[contains(@class, "product-image")]/@id')) + m = re.search(r"%s'.*?srcUrl: '(?P.*?)'.*?}" % cover_id, raw) + if m: + cover_url = m.group('iurl') + + title = ''.join(data.xpath('descendant::p[@class="title"]//span[@class="name"]//text()')).strip() + if not title: + continue + + author = ', '.join(data.xpath('.//ul[contains(@class, "contributors")]//a[contains(@class, "subtle")]//text()')).strip() + price = ''.join(data.xpath('.//a[contains(@class, "bn-price")]//text()')) + + counter -= 1 + + s = SearchResult() + s.cover_url = cover_url + s.title = title.strip() + s.author = author.strip() + s.price = price.strip() + s.detail_item = id.strip() + s.drm = SearchResult.DRM_UNKNOWN + 
s.formats = 'Nook' + + yield s diff --git a/src/calibre/gui2/store/stores/bubok_portugal_plugin.py b/src/calibre/gui2/store/stores/bubok_portugal_plugin.py index 603afda8b140..870f731c196c 100644 --- a/src/calibre/gui2/store/stores/bubok_portugal_plugin.py +++ b/src/calibre/gui2/store/stores/bubok_portugal_plugin.py @@ -7,19 +7,14 @@ __copyright__ = '2014, Rafael Vega ' __docformat__ = 'restructuredtext en' -from contextlib import closing try: from urllib.parse import quote_plus except ImportError: from urllib import quote_plus -from lxml import html - -from qt.core import QUrl - -from calibre import browser, url_slash_cleaner +from calibre import url_slash_cleaner from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog @@ -30,7 +25,7 @@ class BubokPortugalStore(BasicStoreConfig, StorePlugin): def open(self, parent=None, detail_item=None, external=False): url = 'https://www.bubok.pt/tienda' if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_item if detail_item else url))) + open_url(url_slash_cleaner(detail_item if detail_item else url)) else: d = WebStoreDialog(self.gui, url, parent, detail_item) d.setWindowTitle(self.name) @@ -40,38 +35,31 @@ def open(self, parent=None, detail_item=None, external=False): def search(self, query, max_results=10, timeout=60): url = 'http://www.bubok.pt/resellers/calibre_search/' + quote_plus(query) - br = browser() + doc = browser_get_url(url, timeout) counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in doc.xpath('//div[contains(@class, "libro")]'): - if counter <= 0: - break - - id = ''.join(data.xpath('.//div[@class="url"]/text()')) - - title = 
''.join(data.xpath('.//div[@class="titulo"]/text()')) - - author = ''.join(data.xpath('.//div[@class="autor"]/text()')) - - price = ''.join(data.xpath('.//div[@class="precio"]/text()')) - - formats = ''.join(data.xpath('.//div[@class="formatos"]/text()')) - - cover = ''.join(data.xpath('.//div[@class="portada"]/text()')) - - counter -= 1 - - s = SearchResult() - s.title = title.strip() - s.author = author.strip() - s.detail_item = id.strip() - s.price = price.strip() - s.drm = SearchResult.DRM_UNLOCKED - s.formats = formats.strip() - s.cover_url = cover.strip() - yield s + for data in doc.xpath('//div[contains(@class, "libro")]'): + if counter <= 0: + break + + id = ''.join(data.xpath('.//div[@class="url"]/text()')) + title = ''.join(data.xpath('.//div[@class="titulo"]/text()')) + author = ''.join(data.xpath('.//div[@class="autor"]/text()')) + price = ''.join(data.xpath('.//div[@class="precio"]/text()')) + formats = ''.join(data.xpath('.//div[@class="formatos"]/text()')) + cover = ''.join(data.xpath('.//div[@class="portada"]/text()')) + + counter -= 1 + + s = SearchResult() + s.title = title.strip() + s.author = author.strip() + s.detail_item = id.strip() + s.price = price.strip() + s.drm = SearchResult.DRM_UNLOCKED + s.formats = formats.strip() + s.cover_url = cover.strip() + yield s def get_details(self, search_result, timeout): return True diff --git a/src/calibre/gui2/store/stores/bubok_publishing_plugin.py b/src/calibre/gui2/store/stores/bubok_publishing_plugin.py index 5d9da5b50dbf..56cc9495992e 100644 --- a/src/calibre/gui2/store/stores/bubok_publishing_plugin.py +++ b/src/calibre/gui2/store/stores/bubok_publishing_plugin.py @@ -7,17 +7,12 @@ __copyright__ = '2014, Rafael Vega ' __docformat__ = 'restructuredtext en' -from contextlib import closing try: from urllib.parse import quote_plus except ImportError: from urllib import quote_plus -from lxml import html - -from qt.core import QUrl - -from calibre import browser, url_slash_cleaner +from calibre import 
url_slash_cleaner from calibre.gui2 import open_url from calibre.gui2.store import StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig @@ -30,7 +25,7 @@ class BubokPublishingStore(BasicStoreConfig, StorePlugin): def open(self, parent=None, detail_item=None, external=False): url = 'https://www.bubok.es/tienda' if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_item if detail_item else url))) + open_url(url_slash_cleaner(detail_item if detail_item else url)) else: d = WebStoreDialog(self.gui, url, parent, detail_item) d.setWindowTitle(self.name) @@ -40,38 +35,31 @@ def open(self, parent=None, detail_item=None, external=False): def search(self, query, max_results=10, timeout=60): url = 'http://www.bubok.es/resellers/calibre_search/' + quote_plus(query) - br = browser() + doc = browser_get_url(url, timeout) counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in doc.xpath('//div[contains(@class, "libro")]'): - if counter <= 0: - break - - id = ''.join(data.xpath('.//div[@class="url"]/text()')) - - title = ''.join(data.xpath('.//div[@class="titulo"]/text()')) - - author = ''.join(data.xpath('.//div[@class="autor"]/text()')) - - price = ''.join(data.xpath('.//div[@class="precio"]/text()')) - - formats = ''.join(data.xpath('.//div[@class="formatos"]/text()')) - - cover = ''.join(data.xpath('.//div[@class="portada"]/text()')) - - counter -= 1 - - s = SearchResult() - s.title = title.strip() - s.author = author.strip() - s.detail_item = id.strip() - s.price = price.strip() - s.drm = SearchResult.DRM_UNLOCKED - s.formats = formats.strip() - s.cover_url = cover.strip() - yield s + for data in doc.xpath('//div[contains(@class, "libro")]'): + if counter <= 0: + break + + id = ''.join(data.xpath('.//div[@class="url"]/text()')) + title = ''.join(data.xpath('.//div[@class="titulo"]/text()')) + author = 
''.join(data.xpath('.//div[@class="autor"]/text()')) + price = ''.join(data.xpath('.//div[@class="precio"]/text()')) + formats = ''.join(data.xpath('.//div[@class="formatos"]/text()')) + cover = ''.join(data.xpath('.//div[@class="portada"]/text()')) + + counter -= 1 + + s = SearchResult() + s.title = title.strip() + s.author = author.strip() + s.detail_item = id.strip() + s.price = price.strip() + s.drm = SearchResult.DRM_UNLOCKED + s.formats = formats.strip() + s.cover_url = cover.strip() + yield s def get_details(self, search_result, timeout): return True diff --git a/src/calibre/gui2/store/stores/chitanka_plugin.py b/src/calibre/gui2/store/stores/chitanka_plugin.py index 27da2ef28c8f..e5de03434991 100644 --- a/src/calibre/gui2/store/stores/chitanka_plugin.py +++ b/src/calibre/gui2/store/stores/chitanka_plugin.py @@ -7,18 +7,13 @@ __copyright__ = '2011, Alex Stanev ' __docformat__ = 'restructuredtext en' -from contextlib import closing try: from urllib.parse import quote from urllib.error import HTTPError except ImportError: from urllib2 import quote, HTTPError -from lxml import html - -from qt.core import QUrl - -from calibre import browser, url_slash_cleaner +from calibre import url_slash_cleaner from calibre.gui2 import open_url from calibre.gui2.store import StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig @@ -73,7 +68,7 @@ def open(self, parent=None, detail_item=None, external=False): if external or self.config.get('open_external', False): if detail_item: url = url + detail_item - open_url(QUrl(url_slash_cleaner(url))) + open_url(url_slash_cleaner(url)) else: detail_url = None if detail_item: @@ -95,28 +90,22 @@ def search(self, query, max_results=10, timeout=60): counter = max_results # search for book title - br = browser() try: - with closing(br.open(url, timeout=timeout)) as f: - f = f.read().decode('utf-8') - doc = html.fromstring(f) - counter = yield from parse_book_page(doc, base_url, counter) + doc = browser_get_url(url, 
timeout) + counter = yield from parse_book_page(doc, base_url, counter) + if counter <= 0: + return + + # search for author names + for data in doc.xpath('//ul[@class="superlist"][1]/li/dl/dt'): + author_url = ''.join(data.xpath('.//a[contains(@href,"/person/")]/@href')) + if author_url == '': + continue + + person_doc = browser_get_url(base_url + author_url, timeout) + counter = yield from parse_book_page(person_doc, base_url, counter) if counter <= 0: - return - - # search for author names - for data in doc.xpath('//ul[@class="superlist"][1]/li/dl/dt'): - author_url = ''.join(data.xpath('.//a[contains(@href,"/person/")]/@href')) - if author_url == '': - continue - - br2 = browser() - with closing(br2.open(base_url + author_url, timeout=timeout)) as f: - f = f.read().decode('utf-8') - doc = html.fromstring(f) - counter = yield from parse_book_page(doc, base_url, counter) - if counter <= 0: - break + break except HTTPError as e: if e.code == 404: diff --git a/src/calibre/gui2/store/stores/ebook_nl_plugin.py b/src/calibre/gui2/store/stores/ebook_nl_plugin.py index 8a3e053d25cc..c772b594ce96 100644 --- a/src/calibre/gui2/store/stores/ebook_nl_plugin.py +++ b/src/calibre/gui2/store/stores/ebook_nl_plugin.py @@ -7,19 +7,13 @@ __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -from contextlib import closing try: from urllib.parse import quote except ImportError: from urllib import quote -from lxml import html - -from qt.core import QUrl - -from calibre import browser from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog @@ -34,7 +28,7 @@ def open(self, parent=None, detail_item=None, external=False): if external or self.config.get('open_external', False): if detail_item: url = 
url_details.format(detail_item) - open_url(QUrl(url)) + open_url(url) else: detail_url = None if detail_item: @@ -45,55 +39,51 @@ def open(self, parent=None, detail_item=None, external=False): d.exec() def search(self, query, max_results=10, timeout=60): - url = ('http://www.ebook.nl/store/advanced_search_result.php?keywords=' + quote(query)) - br = browser() + url = 'http://www.ebook.nl/store/advanced_search_result.php?keywords=' + quote(query) + doc = browser_get_url(url, timeout) counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in doc.xpath('//div[@id="books"]/div[@itemtype="http://schema.org/Book"]'): - if counter <= 0: - break - - id = ''.join(data.xpath('./meta[@itemprop="url"]/@content')).strip() - if not id: - continue - cover_url = 'http://www.ebook.nl/store/' + ''.join(data.xpath('.//img[@itemprop="image"]/@src')) - title = ''.join(data.xpath('./span[@itemprop="name"]/a/text()')).strip() - author = ''.join(data.xpath('./span[@itemprop="author"]/a/text()')).strip() - if author == ' ': - author = '' - price = ''.join(data.xpath('.//span[@itemprop="price"]//text()')) - counter -= 1 - - s = SearchResult() - s.cover_url = cover_url - s.title = title.strip() - s.author = author.strip() - s.price = price - s.drm = SearchResult.DRM_UNKNOWN - s.detail_item = id - - yield s + for data in doc.xpath('//div[@id="books"]/div[@itemtype="http://schema.org/Book"]'): + if counter <= 0: + break + + id = ''.join(data.xpath('./meta[@itemprop="url"]/@content')).strip() + if not id: + continue + cover_url = 'http://www.ebook.nl/store/' + ''.join(data.xpath('.//img[@itemprop="image"]/@src')) + title = ''.join(data.xpath('./span[@itemprop="name"]/a/text()')).strip() + author = ''.join(data.xpath('./span[@itemprop="author"]/a/text()')).strip() + if author == ' ': + author = '' + price = ''.join(data.xpath('.//span[@itemprop="price"]//text()')) + counter -= 1 + + s = SearchResult() + s.cover_url = cover_url + 
s.title = title.strip() + s.author = author.strip() + s.price = price + s.drm = SearchResult.DRM_UNKNOWN + s.detail_item = id + + yield s def get_details(self, search_result, timeout): - br = browser() - with closing(br.open(search_result.detail_item, timeout=timeout)) as nf: - idata = html.fromstring(nf.read()) - formats = [] - if idata.xpath('.//div[@id="book_detail_body"]/ul/li[strong[contains(., "Type")]]/span[contains(., "ePub")]'): - if idata.xpath('.//div[@id="book_detail_body"]/ul/li[strong[contains(., "Type")]]/span[contains(., "EPUB3")]'): - formats.append('EPUB3') - else: - formats.append('EPUB') - if idata.xpath('.//div[@id="book_detail_body"]/ul/li[strong[contains(., "Type")]]/span[contains(., "Pdf")]'): - formats.append('PDF') - search_result.formats = ', '.join(formats) - - if idata.xpath('.//div[@id="book_detail_body"]/ul/li[strong[contains(., "Type")]]' - '//span[@class="ePubAdobeDRM" or @class="ePubwatermerk" or' - ' @class="Pdfwatermark" or @class="PdfAdobeDRM"]'): - search_result.drm = SearchResult.DRM_LOCKED - if idata.xpath('.//div[@id="book_detail_body"]/ul/li[strong[contains(., "Type")]]//span[@class="ePubzonderDRM"]'): - search_result.drm = SearchResult.DRM_UNLOCKED + idata = browser_get_url(search_result.detail_item, timeout) + formats = [] + if idata.xpath('.//div[@id="book_detail_body"]/ul/li[strong[contains(., "Type")]]/span[contains(., "ePub")]'): + if idata.xpath('.//div[@id="book_detail_body"]/ul/li[strong[contains(., "Type")]]/span[contains(., "EPUB3")]'): + formats.append('EPUB3') + else: + formats.append('EPUB') + if idata.xpath('.//div[@id="book_detail_body"]/ul/li[strong[contains(., "Type")]]/span[contains(., "Pdf")]'): + formats.append('PDF') + search_result.formats = ', '.join(formats) + + if idata.xpath('.//div[@id="book_detail_body"]/ul/li[strong[contains(., "Type")]]' + '//span[@class="ePubAdobeDRM" or @class="ePubwatermerk" or' + ' @class="Pdfwatermark" or @class="PdfAdobeDRM"]'): + search_result.drm = 
SearchResult.DRM_LOCKED + if idata.xpath('.//div[@id="book_detail_body"]/ul/li[strong[contains(., "Type")]]//span[@class="ePubzonderDRM"]'): + search_result.drm = SearchResult.DRM_UNLOCKED return True diff --git a/src/calibre/gui2/store/stores/ebookpoint_plugin.py b/src/calibre/gui2/store/stores/ebookpoint_plugin.py index 3a9c557f459e..e8c907fd69a5 100644 --- a/src/calibre/gui2/store/stores/ebookpoint_plugin.py +++ b/src/calibre/gui2/store/stores/ebookpoint_plugin.py @@ -9,19 +9,14 @@ import re from base64 import b64encode -from contextlib import closing try: from urllib.parse import quote_plus except ImportError: from urllib import quote_plus -from lxml import html - -from qt.core import QUrl - -from calibre import browser, url_slash_cleaner +from calibre import url_slash_cleaner from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog @@ -50,7 +45,7 @@ def open(self, parent=None, detail_item=None, external=False): detail_url = aff_root + as_base64(detail_item) if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_url if detail_url else aff_url))) + open_url(url_slash_cleaner(detail_url if detail_url else aff_url)) else: d = WebStoreDialog(self.gui, url, parent, detail_url if detail_url else aff_url) d.setWindowTitle(self.name) @@ -61,34 +56,32 @@ def search(self, query, max_results=25, timeout=60): url = 'http://ebookpoint.pl/search?qa=&szukaj=' + quote_plus( query.decode('utf-8').encode('iso-8859-2')) + '&serwisyall=0&wprzyg=0&wsprzed=1&wyczerp=0&formaty=em-p' - br = browser() + doc = browser_get_url(url, timeout) counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in 
doc.xpath('//ul[@class="list"]/li'): - if counter <= 0: - break - - id = ''.join(data.xpath('./a/@href')) - if not id: - continue - - formats = ', '.join(data.xpath('.//ul[@class="book-type book-type-points"]//span[@class="popup"]/span/text()')) - cover_url = ''.join(data.xpath('.//p[@class="cover"]/img/@data-src')) - title = ''.join(data.xpath('.//div[@class="book-info"]/h3/a/text()')) - author = ''.join(data.xpath('.//p[@class="author"]//text()')) - price = ''.join(data.xpath('.//p[@class="price price-incart"]/a/ins/text()|.//p[@class="price price-add"]/a/text()')) - - counter -= 1 - - s = SearchResult() - s.cover_url = cover_url - s.title = title.strip() - s.author = author.strip() - s.price = re.sub(r'\.',',',price) - s.detail_item = id.strip() - s.drm = SearchResult.DRM_UNLOCKED - s.formats = formats.upper() - - yield s + for data in doc.xpath('//ul[@class="list"]/li'): + if counter <= 0: + break + + id = ''.join(data.xpath('./a/@href')) + if not id: + continue + + formats = ', '.join(data.xpath('.//ul[@class="book-type book-type-points"]//span[@class="popup"]/span/text()')) + cover_url = ''.join(data.xpath('.//p[@class="cover"]/img/@data-src')) + title = ''.join(data.xpath('.//div[@class="book-info"]/h3/a/text()')) + author = ''.join(data.xpath('.//p[@class="author"]//text()')) + price = ''.join(data.xpath('.//p[@class="price price-incart"]/a/ins/text()|.//p[@class="price price-add"]/a/text()')) + + counter -= 1 + + s = SearchResult() + s.cover_url = cover_url + s.title = title.strip() + s.author = author.strip() + s.price = re.sub(r'\.',',',price) + s.detail_item = id.strip() + s.drm = SearchResult.DRM_UNLOCKED + s.formats = formats.upper() + + yield s diff --git a/src/calibre/gui2/store/stores/ebooks_com_plugin.py b/src/calibre/gui2/store/stores/ebooks_com_plugin.py index f82ff0c902db..852499af6c9c 100644 --- a/src/calibre/gui2/store/stores/ebooks_com_plugin.py +++ b/src/calibre/gui2/store/stores/ebooks_com_plugin.py @@ -8,19 +8,14 @@ __docformat__ = 
'restructuredtext en' import re -from contextlib import closing try: from urllib.parse import quote_plus except ImportError: from urllib import quote_plus -from lxml import html - -from qt.core import QUrl - -from calibre import browser, url_slash_cleaner +from calibre import url_slash_cleaner from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog @@ -39,7 +34,7 @@ def open(self, parent=None, detail_item=None, external=False): detail_url = m_url + d_click + detail_item if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_url if detail_url else url))) + open_url(url_slash_cleaner(detail_url if detail_url else url)) else: d = WebStoreDialog(self.gui, url, parent, detail_url) d.setWindowTitle(self.name) @@ -49,45 +44,43 @@ def open(self, parent=None, detail_item=None, external=False): def search(self, query, max_results=10, timeout=60): url = 'http://www.ebooks.com/SearchApp/SearchResults.net?term=' + quote_plus(query) - br = browser() + doc = browser_get_url(url, timeout) counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in doc.xpath('//div[@id="results"]//li'): - if counter <= 0: - break - - id = ''.join(data.xpath('.//a[1]/@href')) - mo = re.search(r'\d+', id) - if not mo: - continue - id = mo.group() - - cover_url = ''.join(data.xpath('.//div[contains(@class, "img")]//img/@src')) - - title = ''.join(data.xpath( - 'descendant::span[@class="book-title"]/a/text()')).strip() - author = ', '.join(data.xpath( - 'descendant::span[@class="author"]/a/text()')).strip() - if not title or not author: - continue - - price = ''.join(data.xpath( - './/span[starts-with(text(), "US$") or' - ' 
starts-with(text(), "€") or starts-with(text(), "CA$") or' - ' starts-with(text(), "AU$") or starts-with(text(), "£")]/text()')).strip() - - counter -= 1 - - s = SearchResult() - s.cover_url = cover_url - s.title = title.strip() - s.author = author.strip() - s.price = price.strip() - s.detail_item = '?url=http://www.ebooks.com/cj.asp?IID=' + id.strip() + '&cjsku=' + id.strip() - - yield s + for data in doc.xpath('//div[@id="results"]//li'): + if counter <= 0: + break + + id = ''.join(data.xpath('.//a[1]/@href')) + mo = re.search(r'\d+', id) + if not mo: + continue + id = mo.group() + + cover_url = ''.join(data.xpath('.//div[contains(@class, "img")]//img/@src')) + + title = ''.join(data.xpath( + 'descendant::span[@class="book-title"]/a/text()')).strip() + author = ', '.join(data.xpath( + 'descendant::span[@class="author"]/a/text()')).strip() + if not title or not author: + continue + + price = ''.join(data.xpath( + './/span[starts-with(text(), "US$") or' + ' starts-with(text(), "€") or starts-with(text(), "CA$") or' + ' starts-with(text(), "AU$") or starts-with(text(), "£")]/text()')).strip() + + counter -= 1 + + s = SearchResult() + s.cover_url = cover_url + s.title = title.strip() + s.author = author.strip() + s.price = price.strip() + s.detail_item = '?url=http://www.ebooks.com/cj.asp?IID=' + id.strip() + '&cjsku=' + id.strip() + + yield s def get_details(self, search_result, timeout): url = 'http://www.ebooks.com/ebooks/book_display.asp?IID=' @@ -98,17 +91,14 @@ def get_details(self, search_result, timeout): if not id: return - br = browser() - with closing(br.open(url + id, timeout=timeout)) as nf: - pdoc = html.fromstring(nf.read()) - - search_result.drm = SearchResult.DRM_UNLOCKED - permissions = ' '.join(pdoc.xpath('//div[@class="permissions-items"]//text()')) - if 'off' in permissions: - search_result.drm = SearchResult.DRM_LOCKED + pdoc = browser_get_url(url + id, timeout) + search_result.drm = SearchResult.DRM_UNLOCKED + permissions = ' 
'.join(pdoc.xpath('//div[@class="permissions-items"]//text()')) + if 'off' in permissions: + search_result.drm = SearchResult.DRM_LOCKED - fdata = pdoc.xpath('//div[contains(@class, "more-links") and contains(@class, "more-links-info")]/div//span/text()') - if len(fdata) > 1: - search_result.formats = ', '.join(fdata[1:]) + fdata = pdoc.xpath('//div[contains(@class, "more-links") and contains(@class, "more-links-info")]/div//span/text()') + if len(fdata) > 1: + search_result.formats = ', '.join(fdata[1:]) return True diff --git a/src/calibre/gui2/store/stores/ebookshoppe_uk_plugin.py b/src/calibre/gui2/store/stores/ebookshoppe_uk_plugin.py index 9fd8cbebdfe2..8bdddba2b80a 100644 --- a/src/calibre/gui2/store/stores/ebookshoppe_uk_plugin.py +++ b/src/calibre/gui2/store/stores/ebookshoppe_uk_plugin.py @@ -11,15 +11,9 @@ from urllib.parse import quote except ImportError: from urllib2 import quote -from contextlib import closing -from lxml import html - -from qt.core import QUrl - -from calibre import browser from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog @@ -34,7 +28,7 @@ def open(self, parent=None, detail_item=None, external=False): if external or self.config.get('open_external', False): if detail_item: url = url_details.format(detail_item) - open_url(QUrl(url)) + open_url(url) else: detail_url = None if detail_item: @@ -46,48 +40,43 @@ def open(self, parent=None, detail_item=None, external=False): def search(self, query, max_results=10, timeout=60): url = 'http://www.ebookshoppe.com/search.php?search_query=' + quote(query) - br = browser() - br.addheaders = [("Referer", "http://www.ebookshoppe.com/")] + doc = browser_get_url(url, timeout, headers=[("Referer", "http://www.ebookshoppe.com/")]) 
counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in doc.xpath('//ul[@class="ProductList"]/li'): - if counter <= 0: - break - - id = ''.join(data.xpath('./div[@class="ProductDetails"]/' - 'strong/a/@href')).strip() - if not id: - continue - cover_url = ''.join(data.xpath('./div[@class="ProductImage"]/a/img/@src')) - title = ''.join(data.xpath('./div[@class="ProductDetails"]/strong/a/text()')) - price = ''.join(data.xpath('./div[@class="ProductPriceRating"]/em/text()')) - counter -= 1 - - s = SearchResult() - s.cover_url = cover_url - s.title = title.strip() - s.price = price - s.drm = SearchResult.DRM_UNLOCKED - s.detail_item = id - - self.get_author_and_formats(s, timeout) - if not s.author: - continue - - yield s + for data in doc.xpath('//ul[@class="ProductList"]/li'): + if counter <= 0: + break + + id = ''.join(data.xpath('./div[@class="ProductDetails"]/' + 'strong/a/@href')).strip() + if not id: + continue + cover_url = ''.join(data.xpath('./div[@class="ProductImage"]/a/img/@src')) + title = ''.join(data.xpath('./div[@class="ProductDetails"]/strong/a/text()')) + price = ''.join(data.xpath('./div[@class="ProductPriceRating"]/em/text()')) + counter -= 1 + + s = SearchResult() + s.cover_url = cover_url + s.title = title.strip() + s.price = price + s.drm = SearchResult.DRM_UNLOCKED + s.detail_item = id + + self.get_author_and_formats(s, timeout) + if not s.author: + continue + + yield s def get_author_and_formats(self, search_result, timeout): - br = browser() - with closing(br.open(search_result.detail_item, timeout=timeout)) as nf: - idata = html.fromstring(nf.read()) - author = ''.join(idata.xpath('//div[@id="ProductOtherDetails"]/dl/dd[1]/text()')) - if author: - search_result.author = author - formats = idata.xpath('//dl[@class="ProductAddToCart"]/dd/' - 'ul[@class="ProductOptionList"]/li/label/text()') - if formats: - search_result.formats = ', '.join(formats) - search_result.drm = 
SearchResult.DRM_UNKNOWN + idata = browser_get_url(search_result.detail_item, timeout) + author = ''.join(idata.xpath('//div[@id="ProductOtherDetails"]/dl/dd[1]/text()')) + if author: + search_result.author = author + formats = idata.xpath('//dl[@class="ProductAddToCart"]/dd/' + 'ul[@class="ProductOptionList"]/li/label/text()') + if formats: + search_result.formats = ', '.join(formats) + search_result.drm = SearchResult.DRM_UNKNOWN return True diff --git a/src/calibre/gui2/store/stores/empik_plugin.py b/src/calibre/gui2/store/stores/empik_plugin.py index 8a2c4c5008a5..1a37c9beacc8 100644 --- a/src/calibre/gui2/store/stores/empik_plugin.py +++ b/src/calibre/gui2/store/stores/empik_plugin.py @@ -8,19 +8,14 @@ __docformat__ = 'restructuredtext en' from base64 import b64encode -from contextlib import closing try: from urllib.parse import quote except ImportError: from urllib import quote -from lxml import html - -from qt.core import QUrl - -from calibre import browser, url_slash_cleaner +from calibre import url_slash_cleaner from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog @@ -49,7 +44,7 @@ def open(self, parent=None, detail_item=None, external=False): detail_url = aff_root + as_base64(detail_item) if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_url if detail_url else aff_url))) + open_url(url_slash_cleaner(detail_url if detail_url else aff_url)) else: d = WebStoreDialog(self.gui, url, parent, detail_url if detail_url else aff_url) d.setWindowTitle(self.name) @@ -59,37 +54,33 @@ def open(self, parent=None, detail_item=None, external=False): def search(self, query, max_results=10, timeout=60): url = 
'https://www.empik.com/ebooki/ebooki,3501,s?sort=scoreDesc&resultsPP={}&q={}'.format(max_results, quote(query)) - br = browser() - + doc = browser_get_url(url, timeout) counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in doc.xpath('//div[@class="search-content js-search-content"]/div'): - if counter <= 0: - break - - id = ''.join(data.xpath('.//div[@class="name"]/a/@href')) - if not id: - continue - - cover_url = ''.join(data.xpath('.//a/img[@class="lazy"]/@lazy-img')) - author = ', '.join(data.xpath('.//a[@class="smartAuthor"]/text()')) - title = ''.join(data.xpath('.//div[@class="name"]/a/@title')) - price = ''.join(data.xpath('.//div[@class="price ta-price-tile "]/text()')) - - # with closing(br.open('https://empik.com' + id.strip(), timeout=timeout/4)) as nf: - # idata = html.fromstring(nf.read()) - # crawled = idata.xpath('.//a[(@class="chosen hrefstyle") or (@class="connectionsLink hrefstyle")]/text()') - # formats = ','.join([re.sub('ebook, ','', x.strip()) for x in crawled if 'ebook' in x]) - - counter -= 1 - - s = SearchResult() - s.cover_url = cover_url - s.title = title.split('  - ')[0] - s.author = author.strip() - s.price = price.strip() - s.detail_item = 'https://empik.com' + id.strip() - # s.formats = formats.upper().strip() - - yield s + for data in doc.xpath('//div[@class="search-content js-search-content"]/div'): + if counter <= 0: + break + + id = ''.join(data.xpath('.//div[@class="name"]/a/@href')) + if not id: + continue + + cover_url = ''.join(data.xpath('.//a/img[@class="lazy"]/@lazy-img')) + author = ', '.join(data.xpath('.//a[@class="smartAuthor"]/text()')) + title = ''.join(data.xpath('.//div[@class="name"]/a/@title')) + price = ''.join(data.xpath('.//div[@class="price ta-price-tile "]/text()')) + + # idata = browser_get_url('https://empik.com' + id.strip(), timeout/4) + # crawled = idata.xpath('.//a[(@class="chosen hrefstyle") or (@class="connectionsLink 
hrefstyle")]/text()') + # formats = ','.join([re.sub('ebook, ','', x.strip()) for x in crawled if 'ebook' in x]) + + counter -= 1 + + s = SearchResult() + s.cover_url = cover_url + s.title = title.split('  - ')[0] + s.author = author.strip() + s.price = price.strip() + s.detail_item = 'https://empik.com' + id.strip() + # s.formats = formats.upper().strip() + + yield s diff --git a/src/calibre/gui2/store/stores/google_books_plugin.py b/src/calibre/gui2/store/stores/google_books_plugin.py index f81ca040998a..ef5bea7d2611 100644 --- a/src/calibre/gui2/store/stores/google_books_plugin.py +++ b/src/calibre/gui2/store/stores/google_books_plugin.py @@ -7,75 +7,63 @@ __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -from contextlib import closing try: from urllib.parse import quote_plus except ImportError: from urllib import quote_plus -from lxml import html -from qt.core import QUrl +try: + from html5_parser import parse as parse_html +except ImportError: # Old versions of calibre + import html5lib + def parse_html(raw): + return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False) -from calibre import browser, url_slash_cleaner +from calibre import url_slash_cleaner from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog -def parse_html(raw): - try: - from html5_parser import parse - except ImportError: - # Old versions of calibre - import html5lib - return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False) - else: - return parse(raw) - - def search_google(query, max_results=10, timeout=60, write_html_to=None): url = 'https://www.google.com/search?tbm=bks&q=' + quote_plus(query) - br = browser() - + doc = browser_get_url(url, timeout, 
html_parser=parse_html) counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - raw = f.read() - doc = parse_html(raw) - if write_html_to is not None: - praw = html.tostring(doc, encoding='utf-8') - open(write_html_to, 'wb').write(praw) - for data in doc.xpath('//div[@id="rso"]/div'): - if counter <= 0: - break - h3 = data.xpath('descendant::h3') - if not h3: - continue - h3 = h3[0] - a = h3.getparent() - id = a.get('href') - if not id: - continue - - title = ''.join(data.xpath('.//h3//text()')).strip() - authors = data.xpath('descendant::a[@class="fl" and @href]//text()') - while authors and authors[-1].strip().lower() in ('preview', 'read', 'more editions'): - authors = authors[:-1] - if not authors: - continue - author = ' & '.join(authors) - - counter -= 1 - - s = SearchResult() - s.title = title.strip() - s.author = author.strip() - s.detail_item = id.strip() - s.drm = SearchResult.DRM_UNKNOWN - - yield s + if write_html_to is not None: + praw = html.tostring(doc, encoding='utf-8') + open(write_html_to, 'wb').write(praw) + for data in doc.xpath('//div[@id="rso"]/div'): + if counter <= 0: + break + h3 = data.xpath('descendant::h3') + if not h3: + continue + h3 = h3[0] + a = h3.getparent() + id = a.get('href') + if not id: + continue + + title = ''.join(data.xpath('.//h3//text()')).strip() + authors = data.xpath('descendant::a[@class="fl" and @href]//text()') + while authors and authors[-1].strip().lower() in ('preview', 'read', 'more editions'): + authors = authors[:-1] + if not authors: + continue + author = ' & '.join(authors) + + counter -= 1 + + s = SearchResult() + s.title = title.strip() + s.author = author.strip() + s.detail_item = id.strip() + s.drm = SearchResult.DRM_UNKNOWN + + yield s class GoogleBooksStore(BasicStoreConfig, StorePlugin): @@ -83,7 +71,7 @@ class GoogleBooksStore(BasicStoreConfig, StorePlugin): def open(self, parent=None, detail_item=None, external=False): url = 'https://books.google.com/books' if True or external 
or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_item if detail_item else url))) + open_url(url_slash_cleaner(detail_item if detail_item else url)) else: d = WebStoreDialog(self.gui, url, parent, detail_item) d.setWindowTitle(self.name) @@ -95,25 +83,23 @@ def search(self, query, max_results=10, timeout=60): yield result def get_details(self, search_result, timeout): - br = browser() - with closing(br.open(search_result.detail_item, timeout=timeout)) as nf: - doc = parse_html(nf.read()) - - search_result.cover_url = ''.join(doc.xpath('//div[@class="sidebarcover"]//img/@src')) - - # Try to get the set price. - price = ''.join(doc.xpath('//div[@id="gb-get-book-container"]//a/text()')) - if 'read' in price.lower(): - price = 'Unknown' - elif 'free' in price.lower() or not price.strip(): - price = '$0.00' - elif '-' in price: - a, b, price = price.partition(' - ') - search_result.price = price.strip() - - search_result.formats = ', '.join(doc.xpath('//div[contains(@class, "download-panel-div")]//a/text()')).upper() - if not search_result.formats: - search_result.formats = _('Unknown') + doc = browser_get_url(search_result.detail_item, timeout, html_parser=parse_html) + + search_result.cover_url = ''.join(doc.xpath('//div[@class="sidebarcover"]//img/@src')) + + # Try to get the set price. 
+ price = ''.join(doc.xpath('//div[@id="gb-get-book-container"]//a/text()')) + if 'read' in price.lower(): + price = 'Unknown' + elif 'free' in price.lower() or not price.strip(): + price = '$0.00' + elif '-' in price: + a, b, price = price.partition(' - ') + search_result.price = price.strip() + + search_result.formats = ', '.join(doc.xpath('//div[contains(@class, "download-panel-div")]//a/text()')).upper() + if not search_result.formats: + search_result.formats = _('Unknown') return True diff --git a/src/calibre/gui2/store/stores/gutenberg_plugin.py b/src/calibre/gui2/store/stores/gutenberg_plugin.py index 403d214614fc..11cd1317781f 100644 --- a/src/calibre/gui2/store/stores/gutenberg_plugin.py +++ b/src/calibre/gui2/store/stores/gutenberg_plugin.py @@ -11,12 +11,17 @@ except ImportError: from urllib import quote_plus -from html5_parser import parse +try: + from html5_parser import parse as parse_html +except ImportError: # Old versions of calibre + import html5lib + def parse_html(raw): + return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False) + from lxml import etree -from calibre import browser from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog from css_selectors import Select @@ -33,16 +38,9 @@ def absurl(href): def search(query, max_results=10, timeout=60, write_raw_to=None): url = 'https://www.gutenberg.org/ebooks/search/?query={}&submit_search=Search'.format(quote_plus(query)) - counter = max_results - br = browser() - raw = br.open(url).read() - - if write_raw_to is not None: - with open(write_raw_to, 'wb') as f: - f.write(raw) - - root = parse(raw) + root = browser_get_url(url, timeout, save_html_to=write_raw_to, html_parser=parse_html) CSSSelect = Select(root) + counter = max_results for li in CSSSelect('li.booklink'): if 
counter <= 0: break @@ -61,7 +59,7 @@ def search(query, max_results=10, timeout=60, write_raw_to=None): break # Get the formats and direct download links. - details_doc = parse(br.open_novisit(s.detail_item).read()) + details_doc = browser_get_url(s.detail_item, timeout, novisit=True, html_parser=parse_html) doc_select = Select(details_doc) for tr in doc_select('table.files tr[typeof="pgterms:file"]'): for a in doc_select('a.link', tr): diff --git a/src/calibre/gui2/store/stores/kobo_plugin.py b/src/calibre/gui2/store/stores/kobo_plugin.py index 272498c11831..9710038051b9 100644 --- a/src/calibre/gui2/store/stores/kobo_plugin.py +++ b/src/calibre/gui2/store/stores/kobo_plugin.py @@ -26,8 +26,8 @@ def read_url(url, timeout=60): # Kobo uses Akamai which has some bot detection that uses network/tls # protocol data. So use the Chromium network stack to make the request - from calibre.scraper.simple import read_url as ru - return ru(read_url.storage, url, timeout=timeout) + from calibre.gui2.store import http_get_url + return http_get_url(read_url.storage, url, timeout=timeout) read_url.storage = [] diff --git a/src/calibre/gui2/store/stores/legimi_plugin.py b/src/calibre/gui2/store/stores/legimi_plugin.py index d71925c036e7..51169264ee2d 100644 --- a/src/calibre/gui2/store/stores/legimi_plugin.py +++ b/src/calibre/gui2/store/stores/legimi_plugin.py @@ -8,17 +8,12 @@ __docformat__ = 'restructuredtext en' from base64 import b64encode -from contextlib import closing try: from urllib.parse import quote_plus except ImportError: from urllib import quote_plus -from lxml import html - -from qt.core import QUrl - -from calibre import browser, url_slash_cleaner +from calibre import browser_get_url, url_slash_cleaner from calibre.gui2 import open_url from calibre.gui2.store import StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig @@ -49,7 +44,7 @@ def open(self, parent=None, detail_item=None, external=False): detail_url = aff_root + as_base64(detail_item) 
if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_url if detail_url else aff_url))) + open_url(url_slash_cleaner(detail_url if detail_url else aff_url)) else: d = WebStoreDialog(self.gui, url, parent, detail_url if detail_url else aff_url) d.setWindowTitle(self.name) @@ -59,38 +54,34 @@ def open(self, parent=None, detail_item=None, external=False): def search(self, query, max_results=10, timeout=60): url = 'https://www.legimi.pl/ebooki/?sort=score&searchphrase=' + quote_plus(query) - br = browser() + doc = browser_get_url(url, timeout) counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in doc.xpath('//div[@class="book-search row auto-clear"]/div'): - if counter <= 0: - break - - id = ''.join(data.xpath('.//div[@class="panel-body"]/a/@href')) - if not id: - continue - - cover_url = ''.join(data.xpath('.//div[@class="img-content"]/img/@data-src')) - title = ''.join(data.xpath('.//a[@class="book-title clampBookTitle"]/text()')) - author = ' '.join(data.xpath('.//div[@class="authors-container clampBookAuthors"]/a/text()')) - counter -= 1 - - s = SearchResult() - s.cover_url = cover_url - s.title = title.strip() - s.author = author.strip() - s.detail_item = 'https://www.legimi.pl' + id.strip() - s.drm = SearchResult.DRM_UNLOCKED - - yield s + for data in doc.xpath('//div[@class="book-search row auto-clear"]/div'): + if counter <= 0: + break + + id = ''.join(data.xpath('.//div[@class="panel-body"]/a/@href')) + if not id: + continue + + cover_url = ''.join(data.xpath('.//div[@class="img-content"]/img/@data-src')) + title = ''.join(data.xpath('.//a[@class="book-title clampBookTitle"]/text()')) + author = ' '.join(data.xpath('.//div[@class="authors-container clampBookAuthors"]/a/text()')) + counter -= 1 + + s = SearchResult() + s.cover_url = cover_url + s.title = title.strip() + s.author = author.strip() + s.detail_item = 'https://www.legimi.pl' + id.strip() + 
s.drm = SearchResult.DRM_UNLOCKED + + yield s def get_details(self, search_result, timeout): - br = browser() - with closing(br.open(search_result.detail_item, timeout=timeout/2)) as nf: - idata = html.fromstring(nf.read()) + idata = browser_get_url(search_result.detail_item, timeout/2) - price = ''.join(idata.xpath('.//section[@class="book-sale-options"]//p[@class="light-text"]/text()')) - search_result.price = price.split('bez abonamentu ')[-1] + price = ''.join(idata.xpath('.//section[@class="book-sale-options"]//p[@class="light-text"]/text()')) + search_result.price = price.split('bez abonamentu ')[-1] return True diff --git a/src/calibre/gui2/store/stores/libri_de_plugin.py b/src/calibre/gui2/store/stores/libri_de_plugin.py index f3cbe7bc4adb..becd236bc49f 100644 --- a/src/calibre/gui2/store/stores/libri_de_plugin.py +++ b/src/calibre/gui2/store/stores/libri_de_plugin.py @@ -7,19 +7,13 @@ __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -from contextlib import closing try: from urllib.parse import quote except ImportError: from urllib import quote -from lxml import html - -from qt.core import QUrl - -from calibre import browser from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog @@ -35,7 +29,7 @@ def open(self, parent=None, detail_item=None, external=False): if external or self.config.get('open_external', False): if detail_item: url = url_details.format(detail_item) - open_url(QUrl(url)) + open_url(url) else: detail_url = None if detail_item: @@ -47,52 +41,50 @@ def open(self, parent=None, detail_item=None, external=False): def search(self, query, max_results=10, timeout=60): url = ('http://www.ebook.de/de/pathSearch?nav=52122&searchString=' + quote(query)) - br = 
browser() + doc = browser_get_url(url, timeout) counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in doc.xpath('//div[@class="articlecontainer"]'): - if counter <= 0: - break - id_ = ''.join(data.xpath('.//div[@class="trackArtiId"]/text()')) - if not id_: - continue - details = data.xpath('./div[contains(@class, "articleinfobox")]') - if not details: - continue - details = details[0] - title = ''.join(details.xpath('./div[@class="title"]/a/text()')).strip() - author = ''.join(details.xpath('.//div[@class="author"]/text()')).strip() - if author.startswith('von'): - author = author[4:] - - pdf = details.xpath( - 'boolean(.//span[@class="bindername" and contains(text(), "pdf")]/text())') - epub = details.xpath( - 'boolean(.//span[@class="bindername" and contains(text(), "epub")]/text())') - mobi = details.xpath( - 'boolean(.//span[@class="bindername" and contains(text(), "mobipocket")]/text())') - - cover_url = ''.join(data.xpath('.//div[@class="coverimg"]/a/img/@src')) - price = ''.join(data.xpath('.//div[@class="preis"]/text()')).replace('*', '').strip() - - counter -= 1 - - s = SearchResult() - s.cover_url = cover_url - s.title = title.strip() - s.author = author.strip() - s.price = price - s.drm = SearchResult.DRM_UNKNOWN - s.detail_item = id_ - formats = [] - if epub: - formats.append('ePub') - if pdf: - formats.append('PDF') - if mobi: - formats.append('MOBI') - s.formats = ', '.join(formats) - - yield s + for data in doc.xpath('//div[@class="articlecontainer"]'): + if counter <= 0: + break + id_ = ''.join(data.xpath('.//div[@class="trackArtiId"]/text()')) + if not id_: + continue + details = data.xpath('./div[contains(@class, "articleinfobox")]') + if not details: + continue + details = details[0] + title = ''.join(details.xpath('./div[@class="title"]/a/text()')).strip() + author = ''.join(details.xpath('.//div[@class="author"]/text()')).strip() + if author.startswith('von'): + author = 
author[4:] + + pdf = details.xpath( + 'boolean(.//span[@class="bindername" and contains(text(), "pdf")]/text())') + epub = details.xpath( + 'boolean(.//span[@class="bindername" and contains(text(), "epub")]/text())') + mobi = details.xpath( + 'boolean(.//span[@class="bindername" and contains(text(), "mobipocket")]/text())') + + cover_url = ''.join(data.xpath('.//div[@class="coverimg"]/a/img/@src')) + price = ''.join(data.xpath('.//div[@class="preis"]/text()')).replace('*', '').strip() + + counter -= 1 + + s = SearchResult() + s.cover_url = cover_url + s.title = title.strip() + s.author = author.strip() + s.price = price + s.drm = SearchResult.DRM_UNKNOWN + s.detail_item = id_ + formats = [] + if epub: + formats.append('ePub') + if pdf: + formats.append('PDF') + if mobi: + formats.append('MOBI') + s.formats = ', '.join(formats) + + yield s diff --git a/src/calibre/gui2/store/stores/litres_plugin.py b/src/calibre/gui2/store/stores/litres_plugin.py index 5fb531087055..c4084433c4a1 100644 --- a/src/calibre/gui2/store/stores/litres_plugin.py +++ b/src/calibre/gui2/store/stores/litres_plugin.py @@ -16,7 +16,6 @@ from contextlib import closing from lxml import etree -from qt.core import QUrl from calibre import browser, url_slash_cleaner, prints from calibre.ebooks.chardet import xml_to_unicode @@ -32,7 +31,6 @@ class LitResStore(BasicStoreConfig, StorePlugin): # http://robot.litres.ru/pages/biblio_book/?art=174405 def open(self, parent=None, detail_item=None, external=False): - aff_id = u'?' 
+ _get_affiliate_id() url = self.shop_url + aff_id @@ -43,7 +41,7 @@ def open(self, parent=None, detail_item=None, external=False): u'&art=' + quote(detail_item) if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_url if detail_url else url))) + open_url(url_slash_cleaner(detail_url if detail_url else url)) else: d = WebStoreDialog(self.gui, url, parent, detail_url) d.setWindowTitle(self.name) @@ -52,29 +50,26 @@ def open(self, parent=None, detail_item=None, external=False): def search(self, query, max_results=10, timeout=60): search_url = u'http://robot.litres.ru/pages/catalit_browser/?checkpoint=2000-01-02&'\ - 'search=%s&limit=0,%s' - search_url = search_url % (quote(query), max_results) + 'search=%s&limit=0,%s' % (quote(query), max_results) - counter = max_results br = browser() br.addheaders.append(['Accept-Encoding','gzip']) - with closing(br.open(search_url, timeout=timeout)) as r: - ungzipResponse(r,br) - raw= xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0] - - doc = etree.fromstring(raw, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)) - for data in doc.xpath('//*[local-name() = "fb2-book"]'): - if counter <= 0: - break - counter -= 1 - - try: - sRes = self.create_search_result(data) - except Exception as e: - prints('ERROR: cannot parse search result #%s: %s'%(max_results - counter + 1, e)) - continue - yield sRes + ungzipResponse(r, br) + raw = xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0] + doc = etree.fromstring(raw, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)) + + counter = max_results + for data in doc.xpath('//*[local-name() = "fb2-book"]'): + if counter <= 0: + break + counter -= 1 + try: + sRes = self.create_search_result(data) + except Exception as e: + prints('ERROR: cannot parse search result #%s: %s'%(max_results - counter + 1, e)) + continue + yield sRes def get_details(self, 
search_result, timeout=60): pass diff --git a/src/calibre/gui2/store/stores/manybooks_plugin.py b/src/calibre/gui2/store/stores/manybooks_plugin.py index ec174949d59f..cbe3338e061b 100644 --- a/src/calibre/gui2/store/stores/manybooks_plugin.py +++ b/src/calibre/gui2/store/stores/manybooks_plugin.py @@ -41,66 +41,66 @@ def search_manybooks(query, max_results=10, timeout=60, open_search_url='http:// oquery.count = max_results url = oquery.url() - counter = max_results br = browser() with closing(br.open(url, timeout=timeout)) as f: - raw_data = f.read() - raw_data = raw_data.decode('utf-8', 'replace') - doc = etree.fromstring(raw_data, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)) - for data in doc.xpath('//*[local-name() = "entry"]'): - if counter <= 0: - break - - counter -= 1 - - s = SearchResult() - - detail_links = data.xpath('./*[local-name() = "link" and @type = "text/html"]') - if not detail_links: - continue - detail_link = detail_links[0] - detail_href = detail_link.get('href') - if not detail_href: - continue - - s.detail_item = 'http://manybooks.net/titles/' + detail_href.split('tid=')[-1] + '.html' - # These can have HTML inside of them. We are going to get them again later - # just in case. - s.title = ''.join(data.xpath('./*[local-name() = "title"]//text()')).strip() - s.author = ', '.join(data.xpath('./*[local-name() = "author"]//text()')).strip() - - # Follow the detail link to get the rest of the info. - with closing(br.open(detail_href, timeout=timeout/4)) as df: - ddoc = etree.fromstring(df.read(), parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)) - ddata = ddoc.xpath('//*[local-name() = "entry"][1]') - if ddata: - ddata = ddata[0] - - # This is the real title and author info we want. We got - # it previously just in case it's not specified here for some reason. 
- s.title = ''.join(ddata.xpath('./*[local-name() = "title"]//text()')).strip() - s.author = ', '.join(ddata.xpath('./*[local-name() = "author"]//text()')).strip() - if s.author.startswith(','): - s.author = s.author[1:] - if s.author.endswith(','): - s.author = s.author[:-1] - - s.cover_url = ''.join(ddata.xpath('./*[local-name() = "link" and @rel = "http://opds-spec.org/thumbnail"][1]/@href')).strip() - - for link in ddata.xpath('./*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'): - type = link.get('type') - href = link.get('href') - if type: - ext = mimetypes.guess_extension(type) - if ext: - ext = ext[1:].upper().strip() - s.downloads[ext] = href - - s.price = '$0.00' - s.drm = SearchResult.DRM_UNLOCKED - s.formats = 'EPUB, PDB (eReader, PalmDoc, zTXT, Plucker, iSilo), FB2, ZIP, AZW, MOBI, PRC, LIT, PKG, PDF, TXT, RB, RTF, LRF, TCR, JAR' - - yield s + raw_data = f.read().decode('utf-8', 'replace') + doc = etree.fromstring(raw_data, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)) + + counter = max_results + for data in doc.xpath('//*[local-name() = "entry"]'): + if counter <= 0: + break + + counter -= 1 + + s = SearchResult() + + detail_links = data.xpath('./*[local-name() = "link" and @type = "text/html"]') + if not detail_links: + continue + detail_link = detail_links[0] + detail_href = detail_link.get('href') + if not detail_href: + continue + + s.detail_item = 'http://manybooks.net/titles/' + detail_href.split('tid=')[-1] + '.html' + # These can have HTML inside of them. We are going to get them again later + # just in case. + s.title = ''.join(data.xpath('./*[local-name() = "title"]//text()')).strip() + s.author = ', '.join(data.xpath('./*[local-name() = "author"]//text()')).strip() + + # Follow the detail link to get the rest of the info. 
+ with closing(br.open(detail_href, timeout=timeout/4)) as df: + ddoc = etree.fromstring(df.read(), parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)) + ddata = ddoc.xpath('//*[local-name() = "entry"][1]') + if ddata: + ddata = ddata[0] + + # This is the real title and author info we want. We got + # it previously just in case it's not specified here for some reason. + s.title = ''.join(ddata.xpath('./*[local-name() = "title"]//text()')).strip() + s.author = ', '.join(ddata.xpath('./*[local-name() = "author"]//text()')).strip() + if s.author.startswith(','): + s.author = s.author[1:] + if s.author.endswith(','): + s.author = s.author[:-1] + + s.cover_url = ''.join(ddata.xpath('./*[local-name() = "link" and @rel = "http://opds-spec.org/thumbnail"][1]/@href')).strip() + + for link in ddata.xpath('./*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'): + type = link.get('type') + href = link.get('href') + if type: + ext = mimetypes.guess_extension(type) + if ext: + ext = ext[1:].upper().strip() + s.downloads[ext] = href + + s.price = '$0.00' + s.drm = SearchResult.DRM_UNLOCKED + s.formats = 'EPUB, PDB (eReader, PalmDoc, zTXT, Plucker, iSilo), FB2, ZIP, AZW, MOBI, PRC, LIT, PKG, PDF, TXT, RB, RTF, LRF, TCR, JAR' + + yield s class ManyBooksStore(BasicStoreConfig, OpenSearchOPDSStore): diff --git a/src/calibre/gui2/store/stores/mills_boon_uk_plugin.py b/src/calibre/gui2/store/stores/mills_boon_uk_plugin.py index 6ab36e07c309..af7a32a2c356 100644 --- a/src/calibre/gui2/store/stores/mills_boon_uk_plugin.py +++ b/src/calibre/gui2/store/stores/mills_boon_uk_plugin.py @@ -7,19 +7,14 @@ __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -from contextlib import closing try: from urllib.parse import quote except ImportError: from urllib import quote -from lxml import html - -from qt.core import QUrl - -from calibre import browser, url_slash_cleaner +from calibre import url_slash_cleaner from calibre.gui2 
import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog @@ -33,7 +28,7 @@ def open(self, parent=None, detail_item=None, external=False): if external or self.config.get('open_external', False): if detail_item: url = detail_item - open_url(QUrl(url_slash_cleaner(url))) + open_url(url_slash_cleaner(url)) else: if detail_item: detail_url = detail_item @@ -45,36 +40,32 @@ def open(self, parent=None, detail_item=None, external=False): d.exec() def search(self, query, max_results=10, timeout=60): - base_url = 'https://www.millsandboon.co.uk' - url = base_url + '/search.aspx??format=ebook&searchText=' + quote(query) - br = browser() - + url = 'https://www.millsandboon.co.uk/search.aspx?format=ebook&searchText=' + quote(query) + doc = browser_get_url(url, timeout) counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in doc.xpath('//article[contains(@class, "group")]'): - if counter <= 0: - break - id_ = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/@href')).strip() - if not id_: - continue - - cover_url = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@src')) - title = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@alt')).strip() - author = ''.join(data.xpath('.//a[@class="author"]/text()')) - price = ''.join(data.xpath('.//div[@class="type-wrapper"]/ul/li[child::span[text()="eBook"]]/a/text()')) - format_ = ''.join(data.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()')) - drm = SearchResult.DRM_LOCKED - - counter -= 1 - - s = SearchResult() - s.cover_url = cover_url - s.title = title.strip() - s.author = author.strip() - s.price = price - s.detail_item = id_ - s.drm = drm - s.formats = format_ - - yield s + for data in 
doc.xpath('//article[contains(@class, "group")]'): + if counter <= 0: + break + id_ = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/@href')).strip() + if not id_: + continue + + cover_url = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@src')) + title = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@alt')).strip() + author = ''.join(data.xpath('.//a[@class="author"]/text()')) + price = ''.join(data.xpath('.//div[@class="type-wrapper"]/ul/li[child::span[text()="eBook"]]/a/text()')) + format_ = ''.join(data.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()')) + drm = SearchResult.DRM_LOCKED + + counter -= 1 + + s = SearchResult() + s.cover_url = cover_url + s.title = title.strip() + s.author = author.strip() + s.price = price + s.detail_item = id_ + s.drm = drm + s.formats = format_ + + yield s diff --git a/src/calibre/gui2/store/stores/mobileread/cache_update_thread.py b/src/calibre/gui2/store/stores/mobileread/cache_update_thread.py index 4a410e684fd6..2afea0865d7e 100644 --- a/src/calibre/gui2/store/stores/mobileread/cache_update_thread.py +++ b/src/calibre/gui2/store/stores/mobileread/cache_update_thread.py @@ -7,14 +7,11 @@ __docformat__ = 'restructuredtext en' import time -from contextlib import closing from threading import Thread -from lxml import html +from qt.core import pyqtSignal, QObject -from qt.core import (pyqtSignal, QObject) - -from calibre import browser +from calibre.gui2.store import browser_get_url from calibre.gui2.store.search_result import SearchResult @@ -48,22 +45,15 @@ def run(self): self.update_details.emit(_('Downloading book list from MobileRead.')) # Download the book list HTML file from MobileRead. 
- br = browser() - raw_data = None - try: - with closing(br.open(url, timeout=self.timeout)) as f: - raw_data = f.read() - except: - return + data = browser_get_url(url, self.timeout) - if not raw_data or not self._run: + if not data or not self._run: return self.update_details.emit(_('Processing books.')) # Turn books listed in the HTML file into SearchResults's. books = [] try: - data = html.fromstring(raw_data) raw_books = data.xpath('//ul/li') self.total_changed.emit(len(raw_books)) diff --git a/src/calibre/gui2/store/stores/mobileread/mobileread_plugin.py b/src/calibre/gui2/store/stores/mobileread/mobileread_plugin.py index 470239ecdd78..267179840fbb 100644 --- a/src/calibre/gui2/store/stores/mobileread/mobileread_plugin.py +++ b/src/calibre/gui2/store/stores/mobileread/mobileread_plugin.py @@ -9,7 +9,7 @@ import os from threading import Lock -from qt.core import (QUrl, QCoreApplication) +from qt.core import QCoreApplication from calibre.constants import cache_dir from calibre.gui2 import open_url @@ -43,7 +43,7 @@ def open(self, parent=None, detail_item=None, external=False): url = 'https://www.mobileread.com/' if external or self.config.get('open_external', False): - open_url(QUrl(detail_item if detail_item else url)) + open_url(detail_item if detail_item else url) else: if detail_item: d = WebStoreDialog(self.gui, url, parent, detail_item) diff --git a/src/calibre/gui2/store/stores/nexto_plugin.py b/src/calibre/gui2/store/stores/nexto_plugin.py index 379e1689ff96..de784ec048f1 100644 --- a/src/calibre/gui2/store/stores/nexto_plugin.py +++ b/src/calibre/gui2/store/stores/nexto_plugin.py @@ -8,20 +8,15 @@ __docformat__ = 'restructuredtext en' import re -from contextlib import closing from base64 import standard_b64encode try: from urllib.parse import quote_plus except ImportError: from urllib import quote_plus -from lxml import html - -from qt.core import QUrl - -from calibre import browser, url_slash_cleaner +from calibre import url_slash_cleaner from
calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog @@ -53,7 +48,7 @@ def open(self, parent=None, detail_item=None, external=False): detail_url = aff_root + as_base64('http://www.nexto.pl/rf/pr?p=' + book_id) if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_url if detail_url else aff_url))) + open_url(url_slash_cleaner(detail_url if detail_url else aff_url)) else: d = WebStoreDialog(self.gui, url, parent, detail_url if detail_url else aff_url) d.setWindowTitle(self.name) @@ -63,45 +58,42 @@ def open(self, parent=None, detail_item=None, external=False): def search(self, query, max_results=10, timeout=60): url = 'http://www.nexto.pl/szukaj.xml?search-clause=' + quote_plus(query) + '&scid=1015' - br = browser() - offset=0 - + offset = 0 counter = max_results while counter: - with closing(br.open(url + '&_offset={}'.format(offset), timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in doc.xpath('//ul[@class="productslist"]/li'): - if counter <= 0: - break - - id = ''.join(data.xpath('.//div[@class="col-2"]/a/@href')) - if not id: - continue - - price = ''.join(data.xpath('.//strong[@class="nprice"]/text()')) - - cover_url = ''.join(data.xpath('.//img[@class="cover"]/@src')) - cover_url = re.sub(r'%2F', '/', cover_url) - cover_url = re.sub(r'widthMax=120&heightMax=200', 'widthMax=64&heightMax=64', cover_url) - title = ''.join(data.xpath('.//a[@class="title"]/text()')) - title = re.sub(r' – ebook', '', title) - author = ', '.join(data.xpath('.//div[@class="col-7"]//h4//a/text()')) - formats = ', '.join(data.xpath('.//ul[@class="formats"]/li//b/text()')) - DrmFree = data.xpath('.//ul[@class="formats"]/li//b[contains(@title, "znak")]') - - counter
-= 1 - - s = SearchResult() - s.cover_url = cover_url if cover_url[:4] == 'http' else 'http://www.nexto.pl' + cover_url - s.title = title.strip() - s.author = author.strip() - s.price = price.strip() - s.detail_item = id.strip() - s.drm = SearchResult.DRM_UNLOCKED if DrmFree else SearchResult.DRM_LOCKED - s.formats = formats.upper().strip() - - yield s - if not doc.xpath('//div[@class="listnavigator"]//a[@class="next"]'): + doc = browser_get_url(url + '&_offset={}'.format(offset), timeout) + for data in doc.xpath('//ul[@class="productslist"]/li'): + if counter <= 0: break - offset+=10 + + id = ''.join(data.xpath('.//div[@class="col-2"]/a/@href')) + if not id: + continue + + price = ''.join(data.xpath('.//strong[@class="nprice"]/text()')) + + cover_url = ''.join(data.xpath('.//img[@class="cover"]/@src')) + cover_url = re.sub(r'%2F', '/', cover_url) + cover_url = re.sub(r'widthMax=120&heightMax=200', 'widthMax=64&heightMax=64', cover_url) + title = ''.join(data.xpath('.//a[@class="title"]/text()')) + title = re.sub(r' – ebook', '', title) + author = ', '.join(data.xpath('.//div[@class="col-7"]//h4//a/text()')) + formats = ', '.join(data.xpath('.//ul[@class="formats"]/li//b/text()')) + DrmFree = data.xpath('.//ul[@class="formats"]/li//b[contains(@title, "znak")]') + + counter -= 1 + + s = SearchResult() + s.cover_url = cover_url if cover_url[:4] == 'http' else 'http://www.nexto.pl' + cover_url + s.title = title.strip() + s.author = author.strip() + s.price = price.strip() + s.detail_item = id.strip() + s.drm = SearchResult.DRM_UNLOCKED if DrmFree else SearchResult.DRM_LOCKED + s.formats = formats.upper().strip() + + yield s + if not doc.xpath('//div[@class="listnavigator"]//a[@class="next"]'): + break + offset += 10 diff --git a/src/calibre/gui2/store/stores/ozon_ru_plugin.py b/src/calibre/gui2/store/stores/ozon_ru_plugin.py index 57fc4123fc48..19fddf7a8c1f 100644 --- a/src/calibre/gui2/store/stores/ozon_ru_plugin.py +++ 
b/src/calibre/gui2/store/stores/ozon_ru_plugin.py @@ -13,7 +13,12 @@ except ImportError: from urllib import quote_plus -from qt.core import QUrl +try: + from html5_parser import parse as parse_html +except ImportError: # Old versions of calibre + import html5lib + def parse_html(raw): + return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False) from calibre import browser, url_slash_cleaner from calibre.ebooks.chardet import xml_to_unicode @@ -25,39 +30,28 @@ shop_url = 'http://www.ozon.ru' -def parse_html(raw): - try: - from html5_parser import parse - except ImportError: - # Old versions of calibre - import html5lib - return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False) - else: - return parse(raw) - - def search(query, max_results=15, timeout=60): url = 'http://www.ozon.ru/?context=search&text=%s&store=1,0&group=div_book' % quote_plus(query) - counter = max_results br = browser() - with closing(br.open(url, timeout=timeout)) as f: raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0] - root = parse_html(raw) - for tile in root.xpath('//*[@class="bShelfTile inline"]'): - if counter <= 0: - break - counter -= 1 - - s = SearchResult(store_name='OZON.ru') - s.detail_item = shop_url + tile.xpath('descendant::a[@class="eShelfTile_Link"]/@href')[0] - s.title = tile.xpath('descendant::span[@class="eShelfTile_ItemNameText"]/@title')[0] - s.author = tile.xpath('descendant::span[@class="eShelfTile_ItemPerson"]/@title')[0] - s.price = ''.join(tile.xpath('descendant::div[contains(@class, "eShelfTile_Price")]/text()')) - s.cover_url = 'http:' + tile.xpath('descendant::img/@data-original')[0] - s.price = format_price_in_RUR(s.price) - yield s + root = parse_html(raw) + + counter = max_results + for tile in root.xpath('//*[@class="bShelfTile inline"]'): + if counter <= 0: + break + counter -= 1 + + s = SearchResult(store_name='OZON.ru') + s.detail_item = shop_url + 
tile.xpath('descendant::a[@class="eShelfTile_Link"]/@href')[0] + s.title = tile.xpath('descendant::span[@class="eShelfTile_ItemNameText"]/@title')[0] + s.author = tile.xpath('descendant::span[@class="eShelfTile_ItemPerson"]/@title')[0] + s.price = ''.join(tile.xpath('descendant::div[contains(@class, "eShelfTile_Price")]/text()')) + s.cover_url = 'http:' + tile.xpath('descendant::img/@data-original')[0] + s.price = format_price_in_RUR(s.price) + yield s class OzonRUStore(StorePlugin): @@ -65,7 +59,7 @@ class OzonRUStore(StorePlugin): def open(self, parent=None, detail_item=None, external=False): url = detail_item or shop_url if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(url))) + open_url(url_slash_cleaner(url)) else: d = WebStoreDialog(self.gui, shop_url, parent, url) d.setWindowTitle(self.name) diff --git a/src/calibre/gui2/store/stores/publio_plugin.py b/src/calibre/gui2/store/stores/publio_plugin.py index b351590ae46f..71b27dc718ac 100644 --- a/src/calibre/gui2/store/stores/publio_plugin.py +++ b/src/calibre/gui2/store/stores/publio_plugin.py @@ -12,15 +12,10 @@ except ImportError: from urllib import quote from base64 import b64encode -from contextlib import closing -from lxml import html - -from qt.core import QUrl - -from calibre import browser, url_slash_cleaner +from calibre import url_slash_cleaner from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog @@ -48,7 +43,7 @@ def open(self, parent=None, detail_item=None, external=False): detail_url = aff_root + as_base64(detail_item) if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_url if detail_url else aff_url))) + open_url(url_slash_cleaner(detail_url if 
detail_url else aff_url)) else: d = WebStoreDialog(self.gui, url, parent, detail_url if detail_url else aff_url) d.setWindowTitle(self.name) @@ -56,39 +51,36 @@ def open(self, parent=None, detail_item=None, external=False): d.exec() def search(self, query, max_results=20, timeout=60): - - br = browser() - counter = max_results page = 1 while counter: - with closing(br.open('http://www.publio.pl/e-booki,strona{}.html?q={}'.format(page, quote(query)), timeout=timeout)) as f: # noqa - doc = html.fromstring(f.read()) - for data in doc.xpath('//div[@class="products-list"]//div[@class="product-tile"]'): - if counter <= 0: - break - - id = ''.join(data.xpath('.//a[@class="product-tile-cover"]/@href')) - if not id: - continue - - cover_url = ''.join(data.xpath('.//img[@class="product-tile-cover-photo"]/@src')) - title = ''.join(data.xpath('.//span[@class="product-tile-title-long"]/text()')) - author = ', '.join(data.xpath('.//span[@class="product-tile-author"]/a/text()')) - price = ''.join(data.xpath('.//div[@class="product-tile-price-wrapper "]/a/ins/text()')) - formats = ''.join(data.xpath('.//a[@class="product-tile-cover"]/img/@alt')).split(' - ebook ')[1] - - counter -= 1 - - s = SearchResult() - s.cover_url = 'http://www.publio.pl' + cover_url - s.title = title.strip() - s.author = author - s.price = price - s.detail_item = 'http://www.publio.pl' + id.strip() - s.formats = formats.upper().strip() - - yield s - if not doc.xpath('boolean(//a[@class="next"])'): + url = 'http://www.publio.pl/e-booki,strona{}.html?q={}'.format(page, quote(query)) + doc = browser_get_url(url, timeout) + for data in doc.xpath('//div[@class="products-list"]//div[@class="product-tile"]'): + if counter <= 0: break - page+=1 + + id = ''.join(data.xpath('.//a[@class="product-tile-cover"]/@href')) + if not id: + continue + + cover_url = ''.join(data.xpath('.//img[@class="product-tile-cover-photo"]/@src')) + title = ''.join(data.xpath('.//span[@class="product-tile-title-long"]/text()')) + author = 
', '.join(data.xpath('.//span[@class="product-tile-author"]/a/text()')) + price = ''.join(data.xpath('.//div[@class="product-tile-price-wrapper "]/a/ins/text()')) + formats = ''.join(data.xpath('.//a[@class="product-tile-cover"]/img/@alt')).split(' - ebook ')[1] + + counter -= 1 + + s = SearchResult() + s.cover_url = 'http://www.publio.pl' + cover_url + s.title = title.strip() + s.author = author + s.price = price + s.detail_item = 'http://www.publio.pl' + id.strip() + s.formats = formats.upper().strip() + + yield s + if not doc.xpath('boolean(//a[@class="next"])'): + break + page += 1 diff --git a/src/calibre/gui2/store/stores/rw2010_plugin.py b/src/calibre/gui2/store/stores/rw2010_plugin.py index ae9173ee303b..aceff659362d 100644 --- a/src/calibre/gui2/store/stores/rw2010_plugin.py +++ b/src/calibre/gui2/store/stores/rw2010_plugin.py @@ -8,19 +8,14 @@ __docformat__ = 'restructuredtext en' import re -from contextlib import closing try: from urllib.parse import urlencode except ImportError: from urllib import urlencode -from lxml import html - -from qt.core import QUrl - -from calibre import browser, url_slash_cleaner +from calibre import url_slash_cleaner from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog @@ -32,7 +27,7 @@ def open(self, parent=None, detail_item=None, external=False): url = 'http://www.rw2010.pl/' if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_item if detail_item else url))) + open_url(url_slash_cleaner(detail_item if detail_item else url)) else: d = WebStoreDialog(self.gui, url, parent, detail_item) d.setWindowTitle(self.name) @@ -41,42 +36,38 @@ def open(self, parent=None, detail_item=None, external=False): def search(self, query, 
max_results=10, timeout=60): url = 'http://www.rw2010.pl/go.live.php/?launch_macro=catalogue-search-rd' - values={ + values = { 'fkeyword': query, - 'file_type':'' - } - - br = browser() + 'file_type': '' + } + doc = browser_get_url(url, timeout, data=urlencode(values)) counter = max_results - with closing(br.open(url, data=urlencode(values), timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in doc.xpath('//div[@class="ProductDetail"]'): - if counter <= 0: - break - - id = ''.join(data.xpath('.//div[@class="img"]/a/@href')) - if not id: - continue - - with closing(br.open(id.strip(), timeout=timeout/4)) as nf: - idata = html.fromstring(nf.read()) - cover_url = ''.join(idata.xpath('//div[@class="boxa"]//div[@class="img"]/img/@src')) - author = ''.join(idata.xpath('//div[@class="boxb"]//h3[text()="Autor: "]/span/text()')) - title = ''.join(idata.xpath('//div[@class="boxb"]/h2[1]/text()')) - title = re.sub(r'\(#.+\)', '', title) - formats = ''.join(idata.xpath('//div[@class="boxb"]//h3[text()="Format pliku: "]/span/text()')) - price = ''.join(idata.xpath('//div[@class="price-box"]/span/text()')) + ',00 zł' - - counter -= 1 - - s = SearchResult() - s.cover_url = 'http://www.rw2010.pl/' + cover_url - s.title = title.strip() - s.author = author.strip() - s.price = price - s.detail_item = re.sub(r'%3D', '=', id) - s.drm = SearchResult.DRM_UNLOCKED - s.formats = formats[0:-2].upper() - - yield s + for data in doc.xpath('//div[@class="ProductDetail"]'): + if counter <= 0: + break + + id = ''.join(data.xpath('.//div[@class="img"]/a/@href')) + if not id: + continue + + idata = browser_get_url(id.strip(), timeout/4) + cover_url = ''.join(idata.xpath('//div[@class="boxa"]//div[@class="img"]/img/@src')) + author = ''.join(idata.xpath('//div[@class="boxb"]//h3[text()="Autor: "]/span/text()')) + title = ''.join(idata.xpath('//div[@class="boxb"]/h2[1]/text()')) + title = re.sub(r'\(#.+\)', '', title) + formats =
''.join(idata.xpath('//div[@class="boxb"]//h3[text()="Format pliku: "]/span/text()')) + price = ''.join(idata.xpath('//div[@class="price-box"]/span/text()')) + ',00 zł' + + counter -= 1 + + s = SearchResult() + s.cover_url = 'http://www.rw2010.pl/' + cover_url + s.title = title.strip() + s.author = author.strip() + s.price = price + s.detail_item = re.sub(r'%3D', '=', id) + s.drm = SearchResult.DRM_UNLOCKED + s.formats = formats[0:-2].upper() + + yield s diff --git a/src/calibre/gui2/store/stores/smashwords_plugin.py b/src/calibre/gui2/store/stores/smashwords_plugin.py index 4c0da7feeae7..79fb34737869 100644 --- a/src/calibre/gui2/store/stores/smashwords_plugin.py +++ b/src/calibre/gui2/store/stores/smashwords_plugin.py @@ -9,19 +9,16 @@ import random import re -from contextlib import closing try: from urllib.parse import quote except ImportError: from urllib import quote from lxml import html -from qt.core import QUrl - from calibre import browser, url_slash_cleaner from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog @@ -36,49 +33,44 @@ def search(query, max_results=10, timeout=60, save_raw=None): except AttributeError: pass # old version of mechanize + doc = browser_get_url(url, timeout, browser=br, save_html_to=save_raw) counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - raw = f.read() - if save_raw: - with open(save_raw, 'wb') as r: - r.write(raw) - doc = html.fromstring(raw) - for data in doc.xpath('//div[@id="pageContent"]//div[contains(@class, "library-book")]'): - if counter <= 0: - break - data = html.fromstring(html.tostring(data)) - - id_a = ''.join(data.xpath('//span[contains(@class, "library-title")]/a/@href')) - if not
id_a: - continue - - cover_url = ''.join(data.xpath('//img[contains(@class, "book-list-image")]/@src')) - - title = ''.join(data.xpath('.//span[contains(@class, "library-title")]//text()')) - author = ''.join(data.xpath('.//span[contains(@class, "library-by-line")]/a//text()')) - - price = ''.join(data.xpath('.//div[@class="subnote"]//text()')) - if 'Price:' in price: - try: - price = price.partition('Price:')[2] - price = re.sub(r'\s', ' ', price).strip() - price = price.split(' ')[0].strip() - except Exception: - price = 'Unknown' - if price == 'Free!': - price = '$0.00' - - counter -= 1 - - s = SearchResult() - s.cover_url = cover_url - s.title = title.strip() - s.author = author.strip() - s.price = price - s.detail_item = id_a - s.drm = SearchResult.DRM_UNLOCKED - - yield s + for data in doc.xpath('//div[@id="pageContent"]//div[contains(@class, "library-book")]'): + if counter <= 0: + break + data = html.fromstring(html.tostring(data)) + + id_a = ''.join(data.xpath('//span[contains(@class, "library-title")]/a/@href')) + if not id_a: + continue + + cover_url = ''.join(data.xpath('//img[contains(@class, "book-list-image")]/@src')) + + title = ''.join(data.xpath('.//span[contains(@class, "library-title")]//text()')) + author = ''.join(data.xpath('.//span[contains(@class, "library-by-line")]/a//text()')) + + price = ''.join(data.xpath('.//div[@class="subnote"]//text()')) + if 'Price:' in price: + try: + price = price.partition('Price:')[2] + price = re.sub(r'\s', ' ', price).strip() + price = price.split(' ')[0].strip() + except Exception: + price = 'Unknown' + if price == 'Free!': + price = '$0.00' + + counter -= 1 + + s = SearchResult() + s.cover_url = cover_url + s.title = title.strip() + s.author = author.strip() + s.price = price + s.detail_item = id_a + s.drm = SearchResult.DRM_UNLOCKED + + yield s class SmashwordsStore(BasicStoreConfig, StorePlugin): @@ -97,7 +87,7 @@ def open(self, parent=None, detail_item=None, external=False): url = url + aff_id if 
external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_url if detail_url else url))) + open_url(url_slash_cleaner(detail_url if detail_url else url)) else: d = WebStoreDialog(self.gui, url, parent, detail_url) d.setWindowTitle(self.name) @@ -110,11 +100,8 @@ def search(self, query, max_results=10, timeout=60): def get_details(self, search_result, timeout): url = 'https://www.smashwords.com/' - - br = browser() - with closing(br.open(url + search_result.detail_item, timeout=timeout)) as nf: - idata = html.fromstring(nf.read()) - search_result.formats = ', '.join(list(set(idata.xpath('//p//abbr//text()')))) + idata = browser_get_url(url + search_result.detail_item, timeout) + search_result.formats = ', '.join(list(set(idata.xpath('//p//abbr//text()')))) return True diff --git a/src/calibre/gui2/store/stores/swiatebookow_plugin.py b/src/calibre/gui2/store/stores/swiatebookow_plugin.py index 2e863a36ad16..35b01d486051 100644 --- a/src/calibre/gui2/store/stores/swiatebookow_plugin.py +++ b/src/calibre/gui2/store/stores/swiatebookow_plugin.py @@ -8,19 +8,14 @@ __docformat__ = 'restructuredtext en' from base64 import b64encode -from contextlib import closing try: from urllib.parse import quote except ImportError: from urllib import quote -from lxml import html - -from qt.core import QUrl - -from calibre import browser, url_slash_cleaner +from calibre import url_slash_cleaner from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog @@ -49,7 +44,7 @@ def open(self, parent=None, detail_item=None, external=False): detail_url = aff_root + as_base64(detail_item) if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_url if detail_url else 
aff_url))) + open_url(url_slash_cleaner(detail_url if detail_url else aff_url)) else: d = WebStoreDialog(self.gui, url, parent, detail_url if detail_url else aff_url) d.setWindowTitle(self.name) @@ -57,39 +52,36 @@ def open(self, parent=None, detail_item=None, external=False): d.exec() def search(self, query, max_results=10, timeout=60): - - br = browser() - page=1 - + page = 1 counter = max_results while counter: - with closing(br.open('https://www.swiatebookow.pl/ebooki/?q=' + quote(query) + '&page={}'.format(page), timeout=timeout)) as f: - doc = html.fromstring(f.read().decode('utf-8')) - for data in doc.xpath('//div[@class="category-item-container"]//div[@class="book-large"]'): - if counter <= 0: - break - - id = ''.join(data.xpath('./a/@href')) - if not id: - continue - - cover_url = ''.join(data.xpath('.//div[@class="cover-xs"]//img/@data-src')) - price = ''.join(data.xpath('.//span[@class="item-price"]/text()')+data.xpath('.//span[@class="sub-price"]/text()')) - title = ''.join(data.xpath('.//div[@class="largebox-book-info"]//h2/a/text()')) - author = ', '.join(data.xpath('.//div[@class="largebox-book-info"]/p/a/text()')) - - counter -= 1 - - s = SearchResult() - s.cover_url = 'https://www.swiatebookow.pl' + cover_url - s.title = title.strip() - s.author = author.strip() - s.price = price - s.detail_item = 'https://www.swiatebookow.pl' + id - # s.formats = formats.upper() - s.drm = SearchResult.DRM_UNLOCKED - - yield s - if not doc.xpath('//div[@class="paging_bootstrap pagination"]//a[@class="next"]'): + url = 'https://www.swiatebookow.pl/ebooki/?q=' + quote(query) + '&page={}'.format(page) + doc = browser_get_url(url, timeout) + for data in doc.xpath('//div[@class="category-item-container"]//div[@class="book-large"]'): + if counter <= 0: break - page+=1 + + id = ''.join(data.xpath('./a/@href')) + if not id: + continue + + cover_url = ''.join(data.xpath('.//div[@class="cover-xs"]//img/@data-src')) + price = 
''.join(data.xpath('.//span[@class="item-price"]/text()')+data.xpath('.//span[@class="sub-price"]/text()')) + title = ''.join(data.xpath('.//div[@class="largebox-book-info"]//h2/a/text()')) + author = ', '.join(data.xpath('.//div[@class="largebox-book-info"]/p/a/text()')) + + counter -= 1 + + s = SearchResult() + s.cover_url = 'https://www.swiatebookow.pl' + cover_url + s.title = title.strip() + s.author = author.strip() + s.price = price + s.detail_item = 'https://www.swiatebookow.pl' + id + # s.formats = formats.upper() + s.drm = SearchResult.DRM_UNLOCKED + + yield s + if not doc.xpath('//div[@class="paging_bootstrap pagination"]//a[@class="next"]'): + break + page += 1 diff --git a/src/calibre/gui2/store/stores/virtualo_plugin.py b/src/calibre/gui2/store/stores/virtualo_plugin.py index ba89460bf4ce..dfcf5711dfa6 100644 --- a/src/calibre/gui2/store/stores/virtualo_plugin.py +++ b/src/calibre/gui2/store/stores/virtualo_plugin.py @@ -9,19 +9,14 @@ import re from base64 import b64encode -from contextlib import closing try: from urllib.parse import quote except ImportError: from urllib import quote -from lxml import html - -from qt.core import QUrl - -from calibre import browser, url_slash_cleaner +from calibre import url_slash_cleaner from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog @@ -50,7 +45,7 @@ def open(self, parent=None, detail_item=None, external=False): detail_url = aff_root + as_base64(detail_item) if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_url if detail_url else aff_url))) + open_url(url_slash_cleaner(detail_url if detail_url else aff_url)) else: d = WebStoreDialog(self.gui, url, parent, detail_item) d.setWindowTitle(self.name) @@ 
-58,38 +53,35 @@ def open(self, parent=None, detail_item=None, external=False): d.exec() def search(self, query, max_results=12, timeout=60): - url = 'http://virtualo.pl/?q=' + quote(query) - - br = browser() no_drm_pattern = re.compile(r'Watermark|brak') + url = 'http://virtualo.pl/?q=' + quote(query) + doc = browser_get_url(url, timeout) counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in doc.xpath('//div[@class="products-list-wrapper"]//li[@class="product "]'): - if counter <= 0: - break - - id = ''.join(data.xpath('.//div[@class="cover-wrapper"]//a/@href')).split(r'?q=')[0] - if not id: - continue - - price = ''.join(data.xpath('.//div[@class="information"]//div[@class="price"]/text()')) - cover_url = ''.join(data.xpath('.//img[@class="cover"]/@src')) - title = ''.join(data.xpath('.//h3[@class="title"]/a//text()')) - author = ', '.join(data.xpath('.//div[@class="information"]//div[@class="authors"]/a//text()')) - formats = [form.strip() for form in data.xpath('.//div[@class="text-wrapper"]//div[@class="format"]/span[@class="prompt_preview"]/text()')] - nodrm = no_drm_pattern.search(''.join(data.xpath('.//div[@class="protection"]/text()'))) - - counter -= 1 - - s = SearchResult() - s.cover_url = cover_url - s.title = title.strip() - s.author = author.strip() - s.price = re.sub(r'\.',',',price.strip()) - s.detail_item = id - s.formats = ', '.join(list(filter(None, formats))).upper() - s.drm = SearchResult.DRM_UNLOCKED if nodrm else SearchResult.DRM_LOCKED - - yield s + for data in doc.xpath('//div[@class="products-list-wrapper"]//li[@class="product "]'): + if counter <= 0: + break + + id = ''.join(data.xpath('.//div[@class="cover-wrapper"]//a/@href')).split(r'?q=')[0] + if not id: + continue + + price = ''.join(data.xpath('.//div[@class="information"]//div[@class="price"]/text()')) + cover_url = ''.join(data.xpath('.//img[@class="cover"]/@src')) + title = 
''.join(data.xpath('.//h3[@class="title"]/a//text()')) + author = ', '.join(data.xpath('.//div[@class="information"]//div[@class="authors"]/a//text()')) + formats = [form.strip() for form in data.xpath('.//div[@class="text-wrapper"]//div[@class="format"]/span[@class="prompt_preview"]/text()')] + nodrm = no_drm_pattern.search(''.join(data.xpath('.//div[@class="protection"]/text()'))) + + counter -= 1 + + s = SearchResult() + s.cover_url = cover_url + s.title = title.strip() + s.author = author.strip() + s.price = re.sub(r'\.',',',price.strip()) + s.detail_item = id + s.formats = ', '.join(list(filter(None, formats))).upper() + s.drm = SearchResult.DRM_UNLOCKED if nodrm else SearchResult.DRM_LOCKED + + yield s diff --git a/src/calibre/gui2/store/stores/weightless_books_plugin.py b/src/calibre/gui2/store/stores/weightless_books_plugin.py index 55adb42b7c10..561a1fc52314 100644 --- a/src/calibre/gui2/store/stores/weightless_books_plugin.py +++ b/src/calibre/gui2/store/stores/weightless_books_plugin.py @@ -7,19 +7,14 @@ __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -from contextlib import closing try: from urllib.parse import quote_plus except ImportError: from urllib import quote_plus -from lxml import html - -from qt.core import QUrl - -from calibre import browser, url_slash_cleaner +from calibre import url_slash_cleaner from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog @@ -31,7 +26,7 @@ def open(self, parent=None, detail_item=None, external=False): url = 'http://weightlessbooks.com/' if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_item if detail_item else url))) + open_url(url_slash_cleaner(detail_item if detail_item else 
url)) else: d = WebStoreDialog(self.gui, url, parent, detail_item) d.setWindowTitle(self.name) @@ -41,41 +36,39 @@ def open(self, parent=None, detail_item=None, external=False): def search(self, query, max_results=10, timeout=60): url = 'http://weightlessbooks.com/?s=' + quote_plus(query) - br = browser() + doc = browser_get_url(url, timeout) counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in doc.xpath('//li[@class="product"]'): - if counter <= 0: - break + for data in doc.xpath('//li[@class="product"]'): + if counter <= 0: + break - id = ''.join(data.xpath('.//div[@class="cover"]/a/@href')) - if not id: - continue + id = ''.join(data.xpath('.//div[@class="cover"]/a/@href')) + if not id: + continue - cover_url = ''.join(data.xpath('.//div[@class="cover"]/a/img/@src')) + cover_url = ''.join(data.xpath('.//div[@class="cover"]/a/img/@src')) - price = ''.join(data.xpath('.//div[@class="buy_buttons"]/b[1]/text()')) - if not price: - continue + price = ''.join(data.xpath('.//div[@class="buy_buttons"]/b[1]/text()')) + if not price: + continue - formats = ', '.join(data.xpath('.//select[@class="eStore_variation"]//option//text()')) - formats = formats.upper() + formats = ', '.join(data.xpath('.//select[@class="eStore_variation"]//option//text()')) + formats = formats.upper() - title = ''.join(data.xpath('.//h3/a/text()')) - author = ''.join(data.xpath('.//h3//text()')) - author = author.replace(title, '') + title = ''.join(data.xpath('.//h3/a/text()')) + author = ''.join(data.xpath('.//h3//text()')) + author = author.replace(title, '') - counter -= 1 + counter -= 1 - s = SearchResult() - s.cover_url = cover_url - s.title = title.strip() - s.author = author.strip() - s.price = price.strip() - s.detail_item = id.strip() - s.drm = SearchResult.DRM_UNLOCKED - s.formats = formats + s = SearchResult() + s.cover_url = cover_url + s.title = title.strip() + s.author = author.strip() + s.price = price.strip() + 
s.detail_item = id.strip() + s.drm = SearchResult.DRM_UNLOCKED + s.formats = formats - yield s + yield s diff --git a/src/calibre/gui2/store/stores/woblink_plugin.py b/src/calibre/gui2/store/stores/woblink_plugin.py index 50aff52a54e5..c1b1b2bfcba8 100644 --- a/src/calibre/gui2/store/stores/woblink_plugin.py +++ b/src/calibre/gui2/store/stores/woblink_plugin.py @@ -13,11 +13,10 @@ except ImportError: from urllib import urlencode, quote_plus +from contextlib import closing from lxml import html from mechanize import Request -from qt.core import QUrl - from calibre import url_slash_cleaner, browser from calibre.gui2 import open_url from calibre.gui2.store import StorePlugin @@ -53,9 +52,8 @@ def search(query, max_results=10, timeout=60): 'nw_filtry_filtr_zakrescen_formularz[min]':'0', 'nw_filtry_filtr_zakrescen_formularz[max]':'350', })) - r = br.open(rq) - raw = r.read() - doc = html.fromstring('' + raw.decode('utf-8') + '') + with closing(br.open(rq)) as web_page: + doc = html.fromstring('' + web_page.read().decode('utf-8') + '') counter = max_results for data in doc.xpath('//div[@class="nw_katalog_lista_ksiazka ebook " or @class="nw_katalog_lista_ksiazka ebook promocja"]'): @@ -98,7 +96,7 @@ def open(self, parent=None, detail_item=None, external=False): detail_url = aff_root + as_base64(detail_item) if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_url if detail_url else aff_url))) + open_url(url_slash_cleaner(detail_url if detail_url else aff_url)) else: d = WebStoreDialog(self.gui, url, parent, detail_url if detail_url else aff_url) d.setWindowTitle(self.name) diff --git a/src/calibre/gui2/store/stores/wolnelektury_plugin.py b/src/calibre/gui2/store/stores/wolnelektury_plugin.py index cae75f1ff1e6..7b65e00c358e 100644 --- a/src/calibre/gui2/store/stores/wolnelektury_plugin.py +++ b/src/calibre/gui2/store/stores/wolnelektury_plugin.py @@ -7,19 +7,14 @@ __copyright__ = '2012-2014, Tomasz Długosz ' __docformat__ = 
'restructuredtext en' -from contextlib import closing try: from urllib.parse import quote_plus except ImportError: from urllib import quote_plus -from lxml import html - -from qt.core import QUrl - -from calibre import browser, url_slash_cleaner +from calibre import url_slash_cleaner from calibre.gui2 import open_url -from calibre.gui2.store import StorePlugin +from calibre.gui2.store import browser_get_url, StorePlugin from calibre.gui2.store.basic_config import BasicStoreConfig from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog @@ -36,7 +31,7 @@ def open(self, parent=None, detail_item=None, external=False): detail_url = detail_item if external or self.config.get('open_external', False): - open_url(QUrl(url_slash_cleaner(detail_url if detail_url else url))) + open_url(url_slash_cleaner(detail_url if detail_url else url)) else: d = WebStoreDialog(self.gui, url, parent, detail_url) d.setWindowTitle(self.name) @@ -46,37 +41,34 @@ def open(self, parent=None, detail_item=None, external=False): def search(self, query, max_results=10, timeout=60): url = 'https://wolnelektury.pl/szukaj?q=' + quote_plus(query) - br = browser() - + doc = browser_get_url(url, timeout) counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read()) - for data in doc.xpath('//li[@class="Book-item"]'): - if counter <= 0: - break - - id = ''.join(data.xpath('.//div[@class="title"]/a/@href')) - if not id: - continue - - cover_url = ''.join(data.xpath('.//div[@class="cover-area"]//img/@src')) - title = ''.join(data.xpath('.//div[@class="title"]/a[1]/text()')) - author = ', '.join(data.xpath('.//div[@class="author"]/a/text()')) - price = '0,00 zł' - - counter -= 1 - - s = SearchResult() - for link in data.xpath('.//div[@class="book-box-formats"]/span/a'): - ext = ''.join(link.xpath('./text()')) - href = 'https://wolnelektury.pl' + link.get('href') - s.downloads[ext] = href - s.cover_url 
= 'https://wolnelektury.pl' + cover_url.strip() - s.title = title.strip() - s.author = author - s.price = price - s.detail_item = 'https://wolnelektury.pl' + id - s.formats = ', '.join(s.downloads.keys()) - s.drm = SearchResult.DRM_UNLOCKED - - yield s + for data in doc.xpath('//li[@class="Book-item"]'): + if counter <= 0: + break + + id = ''.join(data.xpath('.//div[@class="title"]/a/@href')) + if not id: + continue + + cover_url = ''.join(data.xpath('.//div[@class="cover-area"]//img/@src')) + title = ''.join(data.xpath('.//div[@class="title"]/a[1]/text()')) + author = ', '.join(data.xpath('.//div[@class="author"]/a/text()')) + price = '0,00 zł' + + counter -= 1 + + s = SearchResult() + for link in data.xpath('.//div[@class="book-box-formats"]/span/a'): + ext = ''.join(link.xpath('./text()')) + href = 'https://wolnelektury.pl' + link.get('href') + s.downloads[ext] = href + s.cover_url = 'https://wolnelektury.pl' + cover_url.strip() + s.title = title.strip() + s.author = author + s.price = price + s.detail_item = 'https://wolnelektury.pl' + id + s.formats = ', '.join(s.downloads.keys()) + s.drm = SearchResult.DRM_UNLOCKED + + yield s diff --git a/src/calibre/scraper/simple_backend.py b/src/calibre/scraper/simple_backend.py index 025f6571b98c..93a91b034837 100644 --- a/src/calibre/scraper/simple_backend.py +++ b/src/calibre/scraper/simple_backend.py @@ -132,7 +132,7 @@ def fetch(self, url_or_qurl, timeout=60): if ans is None: eurl = fetching_url.toString() if self.current_fetch['working']: - raise TimeoutError(f'Timed out loading HTML from: {eurl}') + raise TimeoutError(f'Timed out loading HTML from: {eurl} - {timeout}s elapsed') raise ValueError(f'Failed to load HTML from: {eurl}') return ans finally: diff --git a/src/calibre/utils/opensearch/description.py b/src/calibre/utils/opensearch/description.py index 059694d0f25c..a31208a44873 100644 --- a/src/calibre/utils/opensearch/description.py +++ b/src/calibre/utils/opensearch/description.py @@ -7,7 +7,8 @@ from 
contextlib import closing -from calibre import browser +from calibre import browser, prints +from calibre.constants import DEBUG from calibre.utils.xml_parse import safe_xml_fromstring from calibre.utils.opensearch.url import URL @@ -32,8 +33,13 @@ def load(self, url): you'll probably just want to pass a URL into the constructor. ''' br = browser() - with closing(br.open(url, timeout=15)) as f: - doc = safe_xml_fromstring(f.read()) + try: + with closing(br.open(url, timeout=15)) as f: + doc = safe_xml_fromstring(f.read()) + except: + if DEBUG: + prints(f'While loading OpenSearch description, could not fetch URL {url}') + raise # version 1.1 has repeating Url elements. self.urls = []