diff --git a/constants/bad_data.php b/constants/bad_data.php index 7ad055c57a..c078536927 100644 --- a/constants/bad_data.php +++ b/constants/bad_data.php @@ -5,56 +5,56 @@ // Some data we get from outside sources is bad or at least mis-defined // Use lower case for all of these, and then compare to a lower cased version const HAS_NO_VOLUME = [ - 'zookeys', - 'studia hibernica', + 'american museum novitates', 'analecta hibernica', + 'balcanica', 'british art studies', - 'der spiegel', - 'international astronomical union circular', - 'yale french studies', - 'capjournal', 'cap journal', - 'phytokeys', - 'starinar', - 'balcanica', - 'american museum novitates', + 'capjournal', + 'der spiegel', 'european journal of taxonomy', + 'international astronomical union circular', 'international journal of the sociology of language', 'mycokeys', 'past & present', 'past and present', + 'phytokeys', + 'starinar', + 'studia hibernica', + 'yale french studies', + 'zookeys', ]; // Some journals have issues only, no volume numbers // oceanic linguistics special publications has the problem that issues will not show up within temlpates const HAS_NO_ISSUE = [ - 'special papers in palaeontology', - 'oceanic linguistics special publications', + 'archaeological reports', + 'ars orientalis', 'cahiers balkaniques', + 'oceanic linguistics special publications', 'res historica', - 'archaeological reports', - "cahiers d'extrême-asie", + 'special papers in palaeontology', "bulletin de l'ecole française d'extrême-orient", - 'ars orientalis', + "cahiers d'extrême-asie", ]; // Some journals have volumes only, no issue numbers const PREFER_VOLUMES = ['illinois classical studies']; // When issue=volume, drop issue. JSTOR calls volumes issues const PREFER_ISSUES = [ - 'mammalian species', 'bulletin of the united states national museum', + 'mammalian species', ]; const CONFERENCE_LIST = [ - 'Proceedings of Advancing Astrophysics', - 'International Cosmic Ray Conference', + ' of the Association for ', 'Annual Meeting of the Association', + 'International Cosmic Ray Conference', + 'Proceedings of Advancing Astrophysics', 'Proceedings of the ', - ' of the Association for ', ]; const BAD_ACCEPTED_MANUSCRIPT_TITLES = [ + '[placeholder]', + 'article not found', 'oup accepted manuscript', + 'placeholder for arabic language transliteration', 'placeholder for bad pdf file', 'placeholder', 'symbolic placeholder', - '[placeholder]', - 'placeholder for arabic language transliteration', - 'article not found', ]; const BAD_AUTHORS = [ '- -.', @@ -158,140 +158,140 @@ ]; const ARE_WORKS = [ - 'medrxiv', 'ietf datatracker', + 'medrxiv', ]; // Things with dois that should be {{cite document|work=THIS}} const PUBLISHERS_ARE_WORKS = [ - 'the san diego union-tribune', - 'forbes', - 'salon', - 'san jose mercury news', - 'san jose mercury-news', - 'new york times', - 'the new york times', + '[[forbes]] (Russia)', + 'anchorage daily news', + 'arizona daily star', + 'belmontstakes.com', + 'birmingham post', + 'cbs sports', + 'daily news (new york)', + 'daily news & analysis', + 'daily news and analysis', + 'daily news egypt', + 'daily news latino', 'daily news online', + 'daily news, sri lanka', 'daily news', - 'the sun', - 'the times', - 'the star', - 'washington post', - 'the washington post', - 'the tribune', - 'los angeles times', - 'la times', - 'the la times', - 'htmlgiant', - 'the los angeles times', - 'sandiegouniontribune.com', - 'forbes.com', - 'salon.com', - 'mercurynews.com', - 'nytimes.com', - 'thedailynewsonline.com', - 'thesun.com', - 'thetimes.com', - 'thestar.com', - 'washingtonpost.com', - 'thetribune.com', - 'latimes.com', - 'htmlgiant.com', - 'the guardian', - 'fox sports', - 'mlb.com', 'espn.com', + 'findlaw.com', + 'findlaw', + 'fixtures live', + 'forbes afrique', + 'forbes asia', + 'forbes china', + 'forbes contributor blogs', + 'forbes global 2000', + 'forbes india', + 'forbes israel', + 'forbes magazine', 'forbes media', + 'forbes mexico', + 'forbes méxico', + 'forbes middle east', 'forbes online', - 'cbs sports', - 'national journal', + 'forbes russia', + 'forbes viet nam', + 'forbes việt nam', + 'forbes vietnam', + 'forbes.com', + 'forbes.ru', + 'forbes', + 'forbesmiddleeast', + 'fox sports', 'foxnews', - 'the hill', + 'foxsports.com', + 'htmlgiant.com', + 'htmlgiant', + 'india glitz', + 'indiaglitz.com', + 'indiaglitz', + 'international business times', + 'la times', + 'latimes.com', + 'los angeles times', + 'mercurynews.com', + 'mlb.com', + 'national journal', 'nationaljournal.com', - 'the huffington post', - 'the times digital archive', - 'belmontstakes.com', - 'the times archives', + 'nba.com', + 'new york daily news', + 'new york daily news', + 'new york times magazine', 'new york times.com', + 'new york times', 'news shopper', - 'birmingham post', - 'the independent', + 'newsmax', + 'nfl.com', + 'nytimes.com', + 'oneindia', + 'palm beach daily news', + 'philippine information agency', + 'philippine news agency', + 'pia.gov.ph', 'rediff.com', + 'salon.com', + 'salon', + 'san diego union tribune', + 'san jose mercury news', + 'san jose mercury-news', + 'sandiegouniontribune.com', 'squashplayer.co.uk', - 'fixtures live', - 'the star online', - 'oneindia', - 'international business times', + 'stripes', + 'the baltimore sun', + 'the daily news egypt', + 'the guardian', + 'the hill', 'the hindu', - 'daily news and analysis', - 'nfl.com', - 'foxsports.com', + 'the huffington post', + 'the independent', + 'the la times', + 'the los angeles times', + 'the new york times', 'the new yorker', - 'findlaw.com', - 'newsmax', - 'washtimes.com', - 'washington times', - 'findlaw', - 'new york times magazine', - 'stripes', - 'arizona daily star', + 'the san diego union-tribune', + 'the star (malaysia)', + 'the star online', + 'the star', + 'the sun', + 'the times archives', + 'the times digital archive', 'the times of india', 'the times-news', - 'san diego union tribune', - 'the star (malaysia)', + 'the times', + 'the tribune', + 'the washington post', + 'thedailynewsonline.com', + 'thestar.com', + 'thesun.com', + 'thetimes.com', + 'thetribune.com', 'utusan malaysia', - 'daily news, sri lanka', - 'daily news & analysis', - 'new york daily news', - 'new york daily news', - 'daily news (new york)', - 'anchorage daily news', - 'palm beach daily news', - 'daily news egypt', - 'the daily news egypt', - 'daily news latino', - 'forbes méxico', - 'forbes mexico', - 'forbes india', - 'forbesmiddleeast', - 'forbes middle east', - 'forbes russia', - 'forbes.ru', - 'forbes afrique', - 'forbes magazine', - 'forbes asia', - 'forbes israel', - 'forbes global 2000', - 'forbes china', - '[[forbes]] (Russia)', - 'forbes việt nam', - 'forbes vietnam', - 'forbes viet nam', - 'forbes contributor blogs', - 'the baltimore sun', - 'nba.com', - 'philippine news agency', + 'washington post', + 'washington times', + 'washingtonpost.com', + 'washtimes.com', 'www.pna.gov.ph', - 'pia.gov.ph', - 'philippine information agency', - 'indiaglitz', - 'india glitz', - 'indiaglitz.com', // WP:CITALICSRFC and MOS:ITALICWEBCITE ????? 'abc news' 'nbc news', 'cbs news', 'bbc news' ]; // LOWER CASE! WWW not there too! const WORKS_ARE_PUBLISHERS = ['united states census bureau']; // LOWER CASE! const DUBIOUS_JOURNALS = [ - 'oup academic', - 'fda', - 'reuters', - 'associated press', - 'ap', 'ap wire', - 'report', + 'ap', + 'associated press', + 'fda', + 'national institute of standards and technology', 'nist', + 'oup academic', + 'report', + 'reuters', 'tumblr', - 'national institute of standards and technology', ]; // Things we add, but only if publisher and agency are both blank // Catch so-called authors such as hearst magazines, time inc, nielsen business media, inc @@ -312,186 +312,229 @@ 'science', ]; const BAD_TITLES = [ - 'unknown', - 'missing', - 'arxiv e-prints', - 'arxiv mathematics e-prints', - 'ssrn electronic journal', + '403 unauthorized', + '404 not found', + '404', + '404错误', + '404页面', + 'arxiv e-prints', + 'arxiv mathematics e-prints', + 'bloomberg - are you a robot?', + 'breaking news, analysis, politics, blogs, news photos, video, tech reviews - time.com', + 'breaking news, analysis, politics, blogs, news photos, video, tech reviews', + 'cur_title', + 'digital library - pdf document', 'dissertations available from proquest', + 'download limit exceeded', 'ebscohost login', - 'library login', + 'economics working paper archive', + 'error - lexisnexis® publisher', + 'error', + 'explore census data', + 'free live sex cams', + 'google book', + 'google books', 'google groups', - 'sciencedirect', - 'cur_title', - 'wordpress › error', - 'ssrn temporarily unavailable', + 'index of /home', + 'internal server error', + 'just a moment', + 'library login', + 'loading', 'log in - proquest', - 'shibboleth authentication request', + 'missing', + 'msn', 'nookmarkable url intermediate page', - 'google books', - 'rte.ie', - 'loading', - 'google book', - 'just a moment', - 'the article you have been looking for has expired and is not longer available on our system. this is due to newswire licensing terms.', 'openid transaction in progress', - 'download limit exceeded', + 'optica publishing group', + 'oxford music online', + 'page not found', + 'pagina inicia', 'privacy settings', - 'untitled-1', - 'untitled-2', 'professional paper', - 'zbmath', - 'economics working paper archive', - 'theses and dissertations available from proquest', 'proquest ebook central', - 'report', - 'bloomberg - are you a robot?', - 'page not found', - 'free live sex cams', - 'breaking news, analysis, politics, blogs, news photos, video, tech reviews', - 'breaking news, analysis, politics, blogs, news photos, video, tech reviews - time.com', 'redirect notice', - 'oxford music online', - 'trove - archived webpage', - 'pagina inicia', - '404 not found', - '404页面', + 'redirecting', + 'report', + 'request rejected', + 'rte.ie', + 'sciencedirect', + 'shibboleth authentication request', 'sign up ', - 'index of /home', + 'ssrn electronic journal', + 'ssrn temporarily unavailable', + 'the article you have been looking for has expired and is not longer available on our system. this is due to newswire licensing terms.', + 'theses and dissertations available from proquest', + 'trove - archived webpage', + 'unknown', + 'untitled-1', + 'untitled-2', 'usa today - today\'s breaking news, us & world news', - '403 unauthorized', - '404错误', - 'internal server error', - 'error', - '404', - 'error - lexisnexis® publisher', - 'optica publishing group', - 'digital library - pdf document', - 'explore census data', - 'msn', - 'request rejected', - 'redirecting', + 'wordpress › error', + 'zbmath', ]; const IN_PRESS_ALIASES = [ + 'forthcoming', 'in press', + 'in the press', 'inpress', - 'pending', - 'published', - 'published online', - 'no-no', + 'missing', 'n/a', - 'online ahead of print', - 'unpublished', - 'unknown', - 'tba', - 'forthcoming', - 'in the press', 'na', + 'no-no', + 'online ahead of print', + 'pending', + 'published online', + 'published', 'submitted', + 'tba', 'tbd', - 'missing', + 'unknown', + 'unpublished', ]; const NON_JOURNAL_BIBCODES = [ + 'alg.geom', 'arXiv', + 'astro.ph', + 'cond.mat', + 'cs.', + 'econ.', + 'eess.', 'gr.qc', 'hep.ex', 'hep.lat', 'hep.ph', 'hep.th', - 'astro.ph', 'math', + 'nlin.', 'nucl.ex', 'nucl.th', 'physics', 'quant.ph', - 'alg.geom', - 'cond.mat', - 'cs.', - 'econ.', - 'eess.', - 'nlin.', ]; const NON_PUBLISHERS = [ + 'archive.fo', + 'archive.org', + 'archive.today', 'books.google', + 'citeseerx.ist.psu.edu', + 'google book', 'google books', 'google news', 'google.co', - 'google book', - 'zenodo', - 'archive.org', - 'citeseerx.ist.psu.edu', - 'archive.fo', - 'archive.today', 'hdl.handle.net', 'pub med', 'researchgate', + 'zenodo', ]; // Google Inc is a valid publisher, however. const BAD_ZOTERO_TITLES = [ - 'Browse publications', - 'Central Authentication Service', - 'http://', - 'https://', - 'ZbMATH - the first resource for mathematics', - 'MR: Matches for:', + ' has expired', ' Log In', - 'Log In ', - 'Sign in', - 'Bookmarkable URL intermediate page', - 'Shibboleth Authentication Request', - 'domain for sale', - 'website for sale', - 'domain is for sale', - 'website is for sale', - 'lease this domain', - 'domain available', - 'metaTags', - 'An Error Occurred', - 'User Cookie', - 'Cookies Disabled', - 'page not found', - '411 error', - 'url not found', - 'limit exceeded', - 'Error Page', - '}}', - '{{', - 'EU Login', - 'bad gateway', - 'Captcha', + ' Stranica nije pronađena', + '..::.. Error', '.com', '.gov', '.org', - 'View PDF', - 'Wayback Machine', - 'does not exist', - 'Subscribe to read', - 'Wiley Online Library', - 'pagina is niet gevonden', - 'Zoeken in over NA', - 'na een 404', + '{{', + '}}', + '404 - - ', + '404 - ', + '404 - File or directory not found', + '404 - Not Found', + '404 - Page Not Found', + '404 - URL invalid', + '404 | ', + '404 | ', '404 error', + '404 ERROR', + '404 Error', + '404 Not Found', + '404 Page - ', + '404 Page', + '404: Page Not Found ', + '404: PAGE NOT FOUND', + '404!', + '404. That\'s an error', + '404. The page', + '404. The URL', + '404エラ', + '404页面', + '411 error', + '500 Internal Server Error', + 'Aanmelden bij Facebook', + 'Aanmelden of registreren om te bekijken', 'Account Suspended', + 'An Error Has Occured', + 'an error has occurred', + 'An error occured', + 'An Error Occurred', + 'Are you a robot', + 'Article expired', + 'bad gateway', + 'being redirected', + 'bluehost.com', + 'Bookmarkable URL intermediate page', + 'Browse publications', + 'Captcha', + 'Central Authentication Service', + 'Compare Payday Loans', + 'Connecting to the iTunes Store', + 'cookie settings', + 'Cookies Disabled', + 'De pagina is niet gevonden', + 'Document unavailable', + 'does not exist', + 'domain available', + 'domain for sale', + 'domain is for sale', + 'DomainMarket.com', + 'Download Limit Exceeded', + 'DPG Media Privacy Gate', + 'EBSCOhost Login', + 'Einloggen', 'Error 404', + 'Error Page', + 'EU Login', 'EZProxy', - 'EBSCOhost Login', - '404 - Not Found', - '404!', - 'Temporarily Unavailable', - ' has expired', + 'Find the Best Loan Deal', + 'has been registered', + 'has now officially closed', + 'Help Center - The Arizona Republic', + 'http://', + 'https://', + 'Internet Archive Wayback Machine', + 'lease this domain', + 'limit exceeded', + 'Log In ', + 'Login • Instagram', + 'metaTags', + 'Missing page', + 'MR: Matches for:', + 'na een 404', 'not longer available', - 'Article expired', - 'This is due to newswire licensing terms', - 'DPG Media Privacy Gate', 'OpenId transaction in progress', - 'Download Limit Exceeded', - 'Internet Archive Wayback Machine', - 'Url(アドレス)が変わりました', - '404エラ', - 'お探しのページは見つかりませんでした', + 'Page non trouvée', + 'page not found', + 'Page Not Found', + 'Pagina inicia', + 'pagina is niet gevonden', + 'Página não existe', + 'Página no encontrada', + 'Preview unavailable', 'privacy settings', - 'cookie settings', - 'WebCite query', - 'Ой!', + 'register to view', + 'Seite nicht gefunden', + 'sex cams', + 'Shibboleth Authentication Request', + 'ShieldSquare', + 'Sign in', + 'Sign up | LinkedIn', + 'Subscribe to read', + 'subscriber to read', + 'Temporarily Unavailable', + 'The Times & the Sunday Times', + 'The-star.co.kr', + 'This is due to newswire licensing terms', + 'This is not the page you requested', 'Untitled-1', 'Untitled-2', 'Untitled-3', @@ -501,92 +544,50 @@ 'Untitled-7', 'Untitled-8', 'Untitled-9', - 'Are you a robot', - 'Aanmelden of registreren om te bekijken', - 'register to view', - 'being redirected', - 'has been registered', - 'Aanmelden bij Facebook', - 'Einloggen', - 'The Times & the Sunday Times', - 'Login • Instagram', - 'subscriber to read', - 'has now officially closed', - 'An Error Has Occured', - 'an error has occurred', + 'url not found', + 'Url(アドレス)が変わりました', + 'User Cookie', + 'Validate User', + 'View PDF', + 'wasn\'t found on this server', + 'Wayback Machine', + 'WebCite query', + 'website for sale', + 'website is for sale', + 'Wiley Online Library', 'YouTube, a Google company', - 'Seite nicht gefunden', - 'Página no encontrada', + 'ZbMATH - the first resource for mathematics', + 'Zoeken in over NA', + 'Ой!', + 'страница не найдена', 'الصفحة غير موجودة', + 'お探しのページは見つかりませんでした', '找不到网页', - 'страница не найдена', - 'Page non trouvée', - 'An error occured', - 'Compare Payday Loans', - 'Find the Best Loan Deal', - '..::.. Error', - 'Pagina inicia', - 'Help Center - The Arizona Republic', - '404 ERROR', - '404 - URL invalid', - '404. That\'s an error', - '404 - Page Not Found', - 'Página não existe', - 'This is not the page you requested', - 'Page Not Found', - '404 - - ', - 'sex cams', - '404 | ', - 'Missing page', - '404 - File or directory not found', '错误页面', - '404 Page - ', - '404: Page Not Found ', - '404: PAGE NOT FOUND', - '404 Error', - '404 | ', '页面不存在', - 'De pagina is niet gevonden', - '404 - ', - ' Stranica nije pronađena', - '404 Page', - '404. The page', - 'wasn\'t found on this server', - '404. The URL', - 'ShieldSquare', - '404 Not Found', - '404页面', - 'Sign up | LinkedIn', - 'The-star.co.kr', - 'Connecting to the iTunes Store', - '500 Internal Server Error', - 'DomainMarket.com', - 'bluehost.com', - 'Validate User', - 'Document unavailable', - 'Preview unavailable', ]; const CANONICAL_PUBLISHER_URLS = [ + '-ezproxy.', '.acm.org', - 'archivespp.pl', - 'radicalphilosophy.com', + '.ebscohost.com', '.erudit.org', - 'metropolitics.org', - 'metropolitiques.eu', - 'revistas.upr.edu', - '.tci-thaijo.org', - 'tidsskrift.dk', - 'observatoria.rsl.ru', - 'britishartstudies.ac.uk', + '.ezproxy.', '.oup.com', + '.serialssolutions.com', + '.tci-thaijo.org', + '/ezproxy.', 'academic.oup.com', 'aeaweb.org', 'aip.scitation.org', 'amjbot.org', 'annualreviews.org', + 'arc.aiaa.org', + 'archivespp.pl', 'biomedcentral.com', + 'bmcr.brynmawr.edu', 'bmj.com/cgi/pmidlookup', + 'britishartstudies.ac.uk', 'cambridge.org', 'cell.com', 'chestjournal.org', @@ -618,22 +619,27 @@ 'jwildlifedis.org', 'macmillan.com', 'mdpi.com', + 'metropolitics.org', + 'metropolitiques.eu', 'msptm.org', + 'muse.jhu.edu', 'nature.com', 'nrcresearchpress.', + 'observatoria.rsl.ru', 'pnas.org', + 'proxy.lib.', + 'proxy.libraries', 'psyche.entclub.org', 'psycnet.apa.org', 'publications.aap.org', 'pubs.geoscienceworld.org', - 'muse.jhu.edu', 'pubs.rsc.org', - 'xlink.rsc.org', + 'pyglobal.com', + 'radicalphilosophy.com', + 'revistas.upr.edu', 'sagepub.com', 'sagepublications.com', 'scholarpedia.org', - 'arc.aiaa.org', - 'bmcr.brynmawr.edu', 'schweizerbart.de', 'scielo.br', 'scielo.org', @@ -641,22 +647,16 @@ 'sciencemag.org', 'springer.com', 'tandfonline.com', - 'pyglobal.com', 'taylorandfrancis.com', 'thelancet.com', + 'tidsskrift.dk', 'vertpala.ac.cn', 'wiley.com', 'wjgnet.com', 'worldscientific.com', + 'xlink.rsc.org', // Below are journal search engines - '.serialssolutions.com', - '.ebscohost.com', // Below are proxys - 'proxy.libraries', - 'proxy.lib.', - '.ezproxy.', - '-ezproxy.', - '/ezproxy.', // Below are sites that are simply DOI resolvers, like dx.doi.org 'doi.library.ubc.ca', ]; @@ -665,359 +665,359 @@ 'digitalcommons.colby.edu', 'ecommons.luc.edu', 'hal.science', + 'numdam.org', 'perspectivia.net', 'shs.cairn.info', - 'zaguan.unizar.es', - 'numdam.org', 'works.bepress.com', + 'zaguan.unizar.es', ]; const PROXY_HOSTS_TO_ALWAYS_DROP = [ - 'proxy.libraries', - 'proxy.lib.', - '.ezproxy.', '-ezproxy.', - '/ezproxy.', + '.ezproxy.', + '.idm.oclc.org', '.serialssolutions.com', - 'search.ebscohost.com', + '/ezproxy.', 'findarticles.com', 'journals.royalsociety.org', - '.idm.oclc.org', + 'proxy.lib.', + 'proxy.libraries', + 'search.ebscohost.com', ]; // Drop these if there is a valid DOI const PROXY_HOSTS_TO_DROP = [ - 'proxy.libraries', - 'proxy.lib.', - '.ezproxy.', '-ezproxy.', - '/ezproxy.', - '.serialssolutions.com', '.ebscohost.com', - 'linkinghub.elsevier.com', - 'doi.library.ubc.ca', - 'ingentaconnect.com/content', - 'sciencedirect.com/science?_ob', - 'informaworld.com/smpp', + '.ezproxy.', '.search.serialssolutions.com', + '.serialssolutions.com', + '/ezproxy.', + 'delivery.acm.org', 'doi.apa.org', - 'onlinelibrary.wiley.com/resolve/openurl', + 'doi.library.ubc.ca', 'findarticles.com', + 'informaworld.com/smpp', + 'ingentaconnect.com/content', + 'linkinghub.elsevier.com', + 'onlinelibrary.wiley.com/resolve/openurl', + 'proxy.lib.', + 'proxy.libraries', 'psycnet.apa.org', - 'delivery.acm.org', + 'sciencedirect.com/science?_ob', ]; // Drop these if there is a valid FREE DOI const WEB_NEWSPAPERS = [ + 'abante', + 'argus leader', + 'austin daily herald', + 'balita (newspaper)', + 'balita.net.ph', + 'bandera.inquirer.net', 'bbc news', - 'bbc', - 'news.bbc.co.uk', - 'bbc sports', 'bbc sport', - 'www.bbc.co.uk', - 'the economist', - 'washington post', - 'philippine daily inquirer', - 'www.inquirer.net', + 'bbc sports', + 'bbc', + 'bozeman daily chronicle', + 'businessmirror.com.ph', + 'businessmirror', + 'businessworld', + 'chicago tribune', + 'christian science monitor', + 'csm', + 'csmonitor.com', + 'daily tribune (philippines)', + 'inquirer bandera', + 'irishtimes.com', + 'journal.com.ph', + 'la crosse tribune', + 'malaya (newspaper)', + 'malaya.com.ph', 'manila bulletin', - 'mb.com.ph', - 'the philippine star', - 'www.philstar.com', - 'the manila times', - 'www.manilatimes.net', 'manila standard', 'manilastandard.net', - 'sunstar', - 'www.sunstar.com.ph', - 'malaya (newspaper)', - 'malaya.com.ph', - 'daily tribune (philippines)', - 'tribune.net.ph', - 'businessworld', - 'www.bworldonline.com', - 'businessmirror', - 'businessmirror.com.ph', - 'united news', - 'www.unitednews.net.ph', + 'mb.com.ph', 'mindanao gold star daily', 'mindanaogoldstardaily.com', - 'tempo (newspaper)', - 'www.tempo.com.ph', + 'news.bbc.co.uk', 'people\'s journal', - 'journal.com.ph', - 'abante', - 'www.abante.com.ph', - 'balita (newspaper)', - 'balita.net.ph', - 'inquirer bandera', - 'bandera.inquirer.net', + 'philippine daily inquirer', 'pilipino star ngayon', - 'irishtimes.com', + 'rapid city journal', + 'rochester democrat and chronicle', + 'sunstar', + 'tempo (newspaper)', + 'the boston globe', + 'the economist', + 'the indianapolis news', 'the irish times', - 'the seattle times', + 'the manila times', + 'the missoulian', + 'the montana standard', + 'the new york times', 'the news tribune', + 'the philippine star', + 'the seattle times', 'the spokesman-review', - 'the montana standard', - 'the missoulian', 'the spokesman-review', - 'bozeman daily chronicle', - 'the new york times', - 'argus leader', 'the washington post', - 'rapid city journal', - 'austin daily herald', - 'la crosse tribune', - 'chicago tribune', - 'christian science monitor', - 'csm', - 'csmonitor.com', - 'rochester democrat and chronicle', - 'the boston globe', - 'the indianapolis news', + 'tribune.net.ph', + 'united news', + 'washington post', + 'www.abante.com.ph', + 'www.bbc.co.uk', + 'www.bworldonline.com', + 'www.inquirer.net', + 'www.manilatimes.net', + 'www.philstar.com', + 'www.sunstar.com.ph', + 'www.tempo.com.ph', + 'www.unitednews.net.ph', ]; const HOSTS_TO_NOT_ADD = [ + 'doi.org', + 'dx.doi.org', 'this.fails', 'www.ncbi.nlm.nih.gov', - 'dx.doi.org', - 'doi.org', + 'hdl.handle.net', ]; const HOSTNAME_MAP = [ - 'moviecrow.com' => 'MovieCrow', - 'public.ebookcentral.proquest.com' => '[[ProQuest]]', - 'proquest.com' => '[[ProQuest]]', - 'search.proquest.com' => '[[ProQuest]]', - 'cnn.com' => '[[CNN]]', - 'foxnews.com' => '[[Fox News]]', - 'msnbc.com' => '[[MSNBC]]', - 'nbcnews.com' => '[[NBC News]]', - 'cbsnews.com' => '[[CBS News]]', - 'cbs.com' => '[[CBS]]', + 'abante.com.ph' => '[[Abante]]', 'abc.com' => '[[American Broadcasting Company]]', - 'nytimes.com' => '[[The New York Times]]', - 'newyorker.com' => '[[The New Yorker]]', - 'independent.co.uk' => '[[Independent.co.uk]]', - 'cnbc.com' => '[[CNBC]]', - 'theatlantic.com' => '[[The Atlantic]]', - 'jpl.nasa.gov' => '[[Jet Propulsion Laboratory]]', - 'latimes.com' => '[[Los Angeles Times]]', - 'reuters.com' => '[[Reuters]]', - 'dailynews.com' => '[[Los Angeles Daily News]]', - 'gsfc.nasa.gov' => '[[Goddard Space Flight Center]]', - 'newsweek.com' => '[[Newsweek]]', - 'observer.com' => '[[The New York Observer]]', - 'pbs.org' => '[[PBS]]', - 'mediaincanada.com' => '[[Media of Canada]]', - 'zap2it.com' => '[[Zap2it]]', - 'fda.gov' => '[[Food and Drug Administration]]', - 'rte.ie' => '[[RTÉ.ie]]', - 'rockpapershotgun.com' => '[[Rock Paper Shotgun]]', - 'gameinformer.com' => '[[Game Informer]]', - 'lemonde.fr' => '[[Le Monde]]', - 'pcgamer.com' => '[[PC Gamer]]', - 'metacritic.com' => '[[Metacritic]]', - 'redbull.com' => '[[Red Bull]]', + 'abc.net.au' => '[[Australian Broadcasting Corporation]]', 'abcnews.com' => '[[ABC News (United States)|ABC News]]', 'abcnews.go.com' => '[[ABC News (United States)|ABC News]]', + 'acharts.us' => 'αCharts', + 'alfred.com' => 'Alfred Music', 'allmusic.com' => '[[AllMusic]]', + 'altpress.com' => '[[Alternative Press (magazine)|Alternative Press]]', + 'amazon.co.uk' => 'Amazon UK', + 'amazon.com' => 'Amazon', + 'amazon.de' => 'Amazon Germany', 'ancestry.com' => '[[Ancestry.com]]', + 'animaldiversity.org' => '[[Animal Diversity Web]]', 'answers.com' => '[[Answers.com]]', + 'antena3.com' => '[[Antena 3 (Spanish TV channel)|Antena 3]]', 'ap.org' => '[[Associated Press]]', 'apnews.com' => '[[Associated Press News]]', + 'articles.latimes.com' => '[[Los Angeles Times]]', 'avclub.com' => '[[The A.V. Club]]', + 'bac-lac.gc.ca' => '[[Library and Archives Canada]]', 'baidu.com' => '[[Baidu]]', - 'sohu.com' => '[[Sohu]]', + 'balita.net.ph' => '[[Balita (newspaper)|Balita]]', + 'bandera.inquirer.net' => '[[Inquirer Bandera|Bandera]]', + 'beatport.com' => '[[Beatport]]', + 'bet.com' => '[[BET]]', + 'billboard.com' => '[[Billboard (magazine)|Billboard]]', + 'bleacherreport.com' => '[[Bleacher Report]]', 'bloomberg.com' => '[[Bloomberg News]]', + 'bollywoodhungama.com' => '[[Bollywood Hungama]]', + 'bostonglobe.com' => '[[The Boston Globe]]', + 'boxlifemagazine.com' => 'BoxLife', 'breitbart.com' => '[[Breitbart News]]', 'businessinsider.com' => '[[Business Insider]]', + 'businessmirror.com.ph' => '[[BusinessMirror]]', 'buzzfeed.com' => '[[BuzzFeed]]', 'buzzfeednews.com' => '[[BuzzFeed News]]', + 'buzzjack.com' => 'BuzzJack', + 'bworldonline.com' => '[[BusinessWorld]]', + 'capitalxtra.com' => 'Capital XTRA', + 'cbs.com' => '[[CBS]]', + 'cbsnews.com' => '[[CBS News]]', + 'cdjapan.co.jp' => 'CDJapan', + 'chicagotribune.com' => '[[Chicago Tribune]]', + 'cinemaexpress.com' => '[[Cinema Express]]', + 'classicbands.com' => 'ClassicBands.com', + 'cnbc.com' => '[[CNBC]]', + 'cnn.com' => '[[CNN]]', + 'collider.com' => '[[Collider (website)|Collider]]', + 'comedy.co.uk' => '[[British Comedy Guide]]', + 'complex.com' => '[[Complex Networks]]', 'csmonitor.com' => '[[The Christian Science Monitor]]', - 'rollingstone.com' => '[[Rolling Stone]]', 'dailydot.com' => '[[The Daily Dot]]', - 'dailymail.com' => '[[Daily Mail]]', 'dailymail.co.uk' => '[[Daily Mail]]', - 'deseretnews.com' => '[[Deseret News]]', + 'dailymail.com' => '[[Daily Mail]]', + 'dailynews.com' => '[[Los Angeles Daily News]]', + 'dailysabah.com' => '[[Daily Sabah]]', 'dailywire.com' => '[[The Daily Wire]]', 'democracynow.org' => '[[Democracy Now!]]', + 'deseretnews.com' => '[[Deseret News]]', + 'desmoinesregister.com' => '[[The Des Moines Register]]', + 'digitalspy.co.uk' => '[[Digital Spy]]', + 'digitalspy.com' => '[[Digital Spy]]', + 'discogs.com' => '[[Discogs]]', + 'dtnext.in' => '[[DT Next]]', + 'dw.com' => '[[Deutsche Welle]]', 'economist.com' => '[[The Economist]]', - 'facebook.com' => '[[Facebook]]', + 'edge-online.com' => 'Edge', + 'edweek.org' => '[[Education Week]]', + 'elitefts.com' => 'elitefts', 'epochtimes.com' => '[[The Epoch Times]]', + 'espnscrum.com' => '[[ESPNscrum]]', + 'eurogamer.net' => '[[Eurogamer]]', + 'eurokdj.com' => 'Eurodance Encyclopaedia', + 'ew.com' => '[[Entertainment Weekly]]', 'examiner.com' => '[[Examiner.com]]', + 'facebook.com' => '[[Facebook]]', 'familysearch.org' => '[[FamilySearch]]', + 'fda.gov' => '[[Food and Drug Administration]]', 'findagrave.com' => '[[Find a Grave]]', 'forbes.com' => '[[Forbes]]', 'foxbusiness.com' => '[[Fox Business]]', + 'foxnews.com' => '[[Fox News]]', + 'fuse.tv' => '[[Fuse (TV channel)|Fuse]]', + 'gameinformer.com' => '[[Game Informer]]', + 'github.com' => '[[GitHub]]', + 'gsfc.nasa.gov' => '[[Goddard Space Flight Center]]', + 'happygamer.com' => 'Happy Gamer', 'hollywoodreporter.com' => '[[The Hollywood Reporter]]', - 'screenrant.com' => '[[Screen Rant]]', + 'hotnewhiphop.com' => 'HNHH', + 'hroarr.com' => 'HROARR', 'huffingtonpost.com' => '[[HuffPost]]', + 'ibm.com' => '[[IBM]]', 'ibtimes.co.in' => '[[International Business Times]]', 'ibtimes.com' => '[[International Business Times]]', + 'ieee.org' => '[[Institute of Electrical and Electronics Engineers]] (IEEE)', 'imdb.com' => '[[IMDb]]', + 'independent.co.uk' => '[[Independent.co.uk]]', + 'inquirer.net' => '[[Philippine Daily Inquirer]]', + 'insider.com' => '[[Insider.com]]', + 'irishtimes.com' => '[[The Irish Times]]', + 'itunes.apple.com' => '[[iTunes]]', + 'jewishweek.timesofisrael.com' => '[[The Jewish Week]]', + 'journal.com.ph' => '[[People\'s Journal]]', + 'jpl.nasa.gov' => '[[Jet Propulsion Laboratory]]', + 'latimes.com' => '[[Los Angeles Times]]', + 'latino.foxnews.com' => '[[Fox News]]', + 'legacy.com' => '[[Legacy.com]]', + 'lemonde.fr' => '[[Le Monde]]', + 'livescience.com' => '[[Live Science]]', + 'loc.gov' => '[[Library of Congress]]', + 'loudwire.com' => '[[Loudwire]]', + 'malaya.com.ph' => '[[Malaya (newspaper)|Malaya]]', + 'manilastandard.net' => '[[Manila Standard]]', + 'manilatimes.net' => '[[The Manila Times]]', 'mashable.com' => '[[Mashable]]', + 'mayoclinic.org' => '[[Mayo Clinic]]', + 'mb.com.ph' => '[[Manila Bulletin]]', + 'mediaincanada.com' => '[[Media of Canada]]', 'mediamatters.org' => '[[Media Matters for America]]', + 'mensfitness.co.uk' => 'Men\'s Fitness', + 'metacritic.com' => '[[Metacritic]]', + 'miamiherald.com' => '[[Miami Herald]]', + 'microsoft.com' => '[[Microsoft]]', + 'mindanaogoldstardaily.com' => '[[Mindanao Gold Star Daily]]', 'mirror.co.uk' => '[[Daily Mirror]]', + 'mlb.com' => '[[MLB.com]]', + 'moviecrow.com' => 'MovieCrow', + 'msn.com' => '[[MSN]]', + 'msnbc.com' => '[[MSNBC]]', + 'mtv.com' => '[[MTV]]', + 'mtv.de' => 'MTV Germany', + 'musicnotes.com' => 'Musicnotes', 'nationalgeographic.com' => '[[National Geographic Society]]', 'nationalreview.com' => '[[National Review]]', + 'nba.com' => '[[NBA.com]]', + 'nbcnews.com' => '[[NBC News]]', + 'netflix.com' => '[[Netflix]]', 'newrepublic.com' => '[[The New Republic]]', + 'newsweek.com' => '[[Newsweek]]', + 'newyorker.com' => '[[The New Yorker]]', + 'nfl.com' => '[[NFL.com]]', + 'nme.com' => '[[NME]]', + 'noise11.com' => 'noise11.com', 'npr.org' => '[[NPR]]', 'nydailynews.com' => '[[New York Daily News]]', + 'nytimes.com' => '[[The New York Times]]', + 'observer.com' => '[[The New York Observer]]', + 'officialcharts.com' => '[[Official Charts Company|Official Charts]]', + 'open.spotify.com' => '[[Spotify]]', + 'pbs.org' => '[[PBS]]', + 'pcgamer.com' => '[[PC Gamer]]', + 'philstar.com' => '[[The Philippine STAR]]', + 'philstar.com/pilipino-star-ngayon' => '[[Pilipino Star Ngayon]]', + 'pia.gov.ph' => 'Philippine Information Agency', + 'pitchfork.com' => '[[Pitchfork (website)|Pitchfork]]', + 'pna.gov.ph' => 'Philippine News Agency', 'politico.com' => '[[Politico]]', + 'polygon.com' => '[[Polygon (website)|Polygon]]', + 'pro-football-reference.com' => '[[Pro-Football-Reference.com]]', + 'proquest.com' => '[[ProQuest]]', + 'providencephoenix.com' => 'Providence Phoenix', + 'public.ebookcentral.proquest.com' => '[[ProQuest]]', + 'queenvault.com' => 'Queen Vault', + 'ratingsryan.com' => 'Ratings Ryan', + 'redbull.com' => '[[Red Bull]]', + 'reuters.com' => '[[Reuters]]', + 'riaa.com' => '[[Recording Industry Association of America]]', + 'rnz.co.nz' => '[[Radio New Zealand]]', + 'rockpapershotgun.com' => '[[Rock Paper Shotgun]]', + 'rocksmith.ubi.com' => 'Rocksmith+', + 'rollingstone.com' => '[[Rolling Stone]]', 'rottentomatoes.com' => '[[Rotten Tomatoes]]', + 'royal.uk' => 'The Royal Family', + 'rte.ie' => '[[RTÉ.ie]]', 'scientificamerican.com' => '[[Scientific American]]', + 'screenrant.com' => '[[Screen Rant]]', + 'search.proquest.com' => '[[ProQuest]]', + 'secondhandsongs.com' => 'SecondHandSongs', + 'seenews.com' => 'SeeNews', + 'setlist.fm' => 'setlist.fm', + 'sheetmusicnow.com' => 'Sheet Music Now', + 'showbuzzdaily.com' => '[[Showbuzzdaily.com]]', + 'sify.com' => '[[Sify]]', + 'slantmagazine.com' => '[[Slant Magazine]]', + 'sohu.com' => '[[Sohu]]', + 'songlines.co.uk' => '[[Songlines (magazine)|Songlines]]', + 'space.com' => '[[Space.com]]', + 'startribune.com' => '[[Star Tribune]]', + 'strategicmanagementinsight.com' => 'StrategicManagementInsight.com', + 'sunstar.com.ph' => '[[SunStar]]', + 'tempo.com.ph' => '[[Tempo (newspaper)|Tempo]]', + 'tennessean.com' => '[[The Tennessean]]', + 'theathletic.com' => '[[The Athletic]]', + 'theatlantic.com' => '[[The Atlantic]]', + 'theboot.com' => 'The Boot', 'thedailybeast.com' => '[[The Daily Beast]]', 'theepochtimes.com' => '[[The Epoch Times]]', 'theglobeandmail.com' => '[[The Globe and Mail]]', 'theguardian.com' => '[[TheGuardian.com]]', 'thehindu.com' => '[[The Hindu]]', 'theonion.com' => '[[The Onion]]', + 'theprint.in' => '[[ThePrint]]', 'theregister.co.uk' => '[[The Register]]', - 'thetimes.co.uk' => '[[The Times]]', + 'thestar.com' => '[[Toronto Star]]', + 'thestatesman.com' => '[[The Statesman (India)|The Statesman]]', 'thesundaytimes.co.uk' => '[[The Sunday Times]]', + 'thetimes.co.uk' => '[[The Times]]', + 'theweek.co.uk' => '[[The Week]]', + 'theweek.com' => '[[The Week]]', + 'theweek.in' => '[[The Week (Indian magazine)|The Week]]', 'thinkprogress.org' => '[[ThinkProgress]]', 'timesofindia.com' => '[[The Times of India]]', 'timesofindia.indiatimes.com' => '[[The Times of India]]', - 'usatoday.com' => '[[USA Today]]', - 'urbandictionary.com' => '[[Urban Dictionary]]', - 'washingtonpost.com' => '[[The Washington Post]]', - 'washingtontimes.com' => '[[The Washington Times]]', - 'msn.com' => '[[MSN]]', - 'microsoft.com' => '[[Microsoft]]', - 'youtube.com' => '[[YouTube]]', - 'zdnet.com' => '[[ZDNet]]', - 'netflix.com' => '[[Netflix]]', - 'chicagotribune.com' => '[[Chicago Tribune]]', - 'bostonglobe.com' => '[[The Boston Globe]]', - 'bleacherreport.com' => '[[Bleacher Report]]', 'timesofisrael.com' => '[[The Times of Israel]]', - 'jewishweek.timesofisrael.com' => '[[The Jewish Week]]', - 'miamiherald.com' => '[[Miami Herald]]', - 'desmoinesregister.com' => '[[The Des Moines Register]]', - 'thestar.com' => '[[Toronto Star]]', - 'tennessean.com' => '[[The Tennessean]]', - 'startribune.com' => '[[Star Tribune]]', - 'comedy.co.uk' => '[[British Comedy Guide]]', - 'legacy.com' => '[[Legacy.com]]', - 'slantmagazine.com' => '[[Slant Magazine]]', - 'nme.com' => '[[NME]]', - 'eurokdj.com' => 'Eurodance Encyclopaedia', - 'videostatic.com' => 'VideoStatic', - 'discogs.com' => '[[Discogs]]', - 'musicnotes.com' => 'Musicnotes', - 'amazon.com' => 'Amazon', - 'amazon.de' => 'Amazon Germany', - 'amazon.co.uk' => 'Amazon UK', 'tophit.ru' => '[[Tophit]]', - 'acharts.us' => 'αCharts', - 'bet.com' => '[[BET]]', - 'mtv.com' => '[[MTV]]', - 'ultratop.be' => '[[Ultratop]]', - 'billboard.com' => '[[Billboard (magazine)|Billboard]]', - 'officialcharts.com' => '[[Official Charts Company|Official Charts]]', - 'loudwire.com' => '[[Loudwire]]', - 'altpress.com' => '[[Alternative Press (magazine)|Alternative Press]]', - 'noise11.com' => 'noise11.com', - 'bac-lac.gc.ca' => '[[Library and Archives Canada]]', - 'itunes.apple.com' => '[[iTunes]]', - 'cdjapan.co.jp' => 'CDJapan', - 'rocksmith.ubi.com' => 'Rocksmith+', - 'setlist.fm' => 'setlist.fm', - 'digitalspy.co.uk' => '[[Digital Spy]]', - 'digitalspy.com' => '[[Digital Spy]]', - 'buzzjack.com' => 'BuzzJack', - 'open.spotify.com' => '[[Spotify]]', - 'whosampled.com' => '[[WhoSampled]]', - 'fuse.tv' => '[[Fuse (TV channel)|Fuse]]', - 'collider.com' => '[[Collider (website)|Collider]]', - 'space.com' => '[[Space.com]]', - 'github.com' => '[[GitHub]]', - 'strategicmanagementinsight.com' => 'StrategicManagementInsight.com', - 'edge-online.com' => 'Edge', - 'boxlifemagazine.com' => 'BoxLife', - 'hroarr.com' => 'HROARR', - 'mensfitness.co.uk' => 'Men\'s Fitness', - 'riaa.com' => '[[Recording Industry Association of America]]', - 'secondhandsongs.com' => 'SecondHandSongs', - 'alfred.com' => 'Alfred Music', - 'capitalxtra.com' => 'Capital XTRA', - 'theboot.com' => 'The Boot', - 'ew.com' => '[[Entertainment Weekly]]', - 'antena3.com' => '[[Antena 3 (Spanish TV channel)|Antena 3]]', - 'complex.com' => '[[Complex Networks]]', - 'articles.latimes.com' => '[[Los Angeles Times]]', - 'providencephoenix.com' => 'Providence Phoenix', - 'queenvault.com' => 'Queen Vault', - 'classicbands.com' => 'ClassicBands.com', - 'seenews.com' => 'SeeNews', - 'mtv.de' => 'MTV Germany', - 'eurogamer.net' => '[[Eurogamer]]', - 'happygamer.com' => 'Happy Gamer', - 'polygon.com' => '[[Polygon (website)|Polygon]]', - 'edweek.org' => '[[Education Week]]', - 'hotnewhiphop.com' => 'HNHH', - 'beatport.com' => '[[Beatport]]', - 'pitchfork.com' => '[[Pitchfork (website)|Pitchfork]]', - 'sheetmusicnow.com' => 'Sheet Music Now', - 'mayoclinic.org' => '[[Mayo Clinic]]', - 'elitefts.com' => 'elitefts', - 'loc.gov' => '[[Library of Congress]]', - 'livescience.com' => '[[Live Science]]', - 'insider.com' => '[[Insider.com]]', - 'latino.foxnews.com' => '[[Fox News]]', - 'dw.com' => '[[Deutsche Welle]]', - 'rnz.co.nz' => '[[Radio New Zealand]]', - 'pro-football-reference.com' => '[[Pro-Football-Reference.com]]', - 'nba.com' => '[[NBA.com]]', - 'mlb.com' => '[[MLB.com]]', - 'nfl.com' => '[[NFL.com]]', - 'showbuzzdaily.com' => '[[Showbuzzdaily.com]]', - 'ratingsryan.com' => 'Ratings Ryan', - 'vanityfair.com' => '[[Vanity Fair (magazine)|Vanity Fair]]', - 'songlines.co.uk' => '[[Songlines (magazine)|Songlines]]', - 'abc.net.au' => '[[Australian Broadcasting Corporation]]', - 'espnscrum.com' => '[[ESPNscrum]]', - 'thestatesman.com' => '[[The Statesman (India)|The Statesman]]', - 'dailysabah.com' => '[[Daily Sabah]]', - 'ibm.com' => '[[IBM]]', - 'ieee.org' => '[[Institute of Electrical and Electronics Engineers]] (IEEE)', - 'bandera.inquirer.net' => '[[Inquirer Bandera|Bandera]]', - 'inquirer.net' => '[[Philippine Daily Inquirer]]', - 'mb.com.ph' => '[[Manila Bulletin]]', - 'philstar.com/pilipino-star-ngayon' => '[[Pilipino Star Ngayon]]', - 'philstar.com' => '[[The Philippine STAR]]', - 'manilatimes.net' => '[[The Manila Times]]', - 'manilastandard.net' => '[[Manila Standard]]', - 'sunstar.com.ph' => '[[SunStar]]', - 'malaya.com.ph' => '[[Malaya (newspaper)|Malaya]]', 'tribune.net.ph' => '[[Daily Tribune (Philippines)|Daily Tribune]]', - 'bworldonline.com' => '[[BusinessWorld]]', - 'businessmirror.com.ph' => '[[BusinessMirror]]', + 'ultratop.be' => '[[Ultratop]]', 'unitednews.net.ph' => '[[United News]]', - 'mindanaogoldstardaily.com' => '[[Mindanao Gold Star Daily]]', - 'tempo.com.ph' => '[[Tempo (newspaper)|Tempo]]', - 'journal.com.ph' => '[[People\'s Journal]]', - 'abante.com.ph' => '[[Abante]]', - 'balita.net.ph' => '[[Balita (newspaper)|Balita]]', - 'pna.gov.ph' => 'Philippine News Agency', - 'pia.gov.ph' => 'Philippine Information Agency', - 'irishtimes.com' => '[[The Irish Times]]', + 'urbandictionary.com' => '[[Urban Dictionary]]', + 'usatoday.com' => '[[USA Today]]', + 'vanityfair.com' => '[[Vanity Fair (magazine)|Vanity Fair]]', + 'videostatic.com' => 'VideoStatic', + 'washingtonpost.com' => '[[The Washington Post]]', + 'washingtontimes.com' => '[[The Washington Times]]', + 'whosampled.com' => '[[WhoSampled]]', 'wikinews.org' => '[[Wikinews]]', - 'royal.uk' => 'The Royal Family', - 'cinemaexpress.com' => '[[Cinema Express]]', - 'sify.com' => '[[Sify]]', - 'animaldiversity.org' => '[[Animal Diversity Web]]', - 'bollywoodhungama.com' => '[[Bollywood Hungama]]', - 'dtnext.in' => '[[DT Next]]', - 'theweek.co.uk' => '[[The Week]]', - 'theweek.com' => '[[The Week]]', - 'theweek.in' => '[[The Week (Indian magazine)|The Week]]', - 'theathletic.com' => '[[The Athletic]]', - 'theprint.in' => '[[ThePrint]]', + 'youtube.com' => '[[YouTube]]', + 'zap2it.com' => '[[Zap2it]]', + 'zdnet.com' => '[[ZDNet]]', ]; // Be warned, some website host a seperate sunday edition, etc. Be careful and when in doubt link to hostname const NO_DATE_WEBSITES = [ - 'chairs-chaires.gc.ca', - 'apopo.org', 'allmusic.com', + 'apopo.org', 'apps.des.qld.gov.au', 'archive-it.org', 'archive.fo', @@ -1033,6 +1033,7 @@ 'brema.suub.uni-bremen.de', 'britannica.com', 'catalogue.bnf.fr', + 'chairs-chaires.gc.ca', 'chinesefilmclassics.org', 'chroniclingamerica.loc.gov', 'collections.louvre.fr', @@ -1084,74 +1085,72 @@ ]; const ZOTERO_AVOID_REGEX = [ - 'twitter\.', // This should be {{cite tweet}} - // Zotero seems to be doing better now 'youtube\.', 'youtu\.be', + 'arkive\.org', + 'australian-charts\.com', // Fails 100% + 'biodiversity\.org\.au', // Zotero gives bad data + 'bloomberg\.com/tosv2.html', // Junk 'books\.google\.', // We have special google books code + 'britishnewspaperarchive\.co\.uk', // Requires registration + 'cagematch\.net', // Fails 100% + 'censusindia\.gov\.in', // Fails 100% + 'charts\.nz', // Fails 100% + 'ebooks\.adelaide\.edu\.au', // Dead + 'elonet\.finna\.fi', // Zotero cannot handle this data-base well at all + 'explore\.bl\.uk/primo_library', // Fails 100% + 'ezproxy', + 'facebook\.com', // login and junk + 'findarticles\.com', // Gone 'google\.com/search', // Google search results + 'info\.hazu\.hr', // Fails 100% 'jstor\.org/stable/', // We have special jstor code - 'ned\.ipac\.caltech\.edu', // Gives no real title - 'pep\-web\.org', // Does not parse very well at all - 'ezproxy', - 'arkive\.org', - 'bloomberg\.com/tosv2.html', // Junk - 'worldcat\.org', // Should use parameters and google instead + 'kijkcijferanalyse\.nl/', // highjacked 'kyobobook\.co\.kr', // Bookstore that give junk - 'facebook\.com', // login and junk 'leighrayment\.com', // highjacked - 'seapower\-digital\.com', // highjacked - 'scholarlycommons\.pacific\.edu\/euler\-works', // returns journal reference to works, not the actual work + 'lincstothepast\.com', 'miar\.ub\.edu\/issn', // ISSN description, not actually the journal - 'britishnewspaperarchive\.co\.uk', // Requires registration + 'myprivacy\.dpgmedia\.nl', // blocker + 'ned\.ipac\.caltech\.edu', // Gives no real title + 'olympics\.com', // Fails 100% + 'pep\-web\.org', // Does not parse very well at all 'pressreader\.com', // Bad titles - 'ebooks\.adelaide\.edu\.au', // Dead + 'radiomap\.eu', // Fails 100% + 'scholarlycommons\.pacific\.edu\/euler\-works', // returns journal reference to works, not the actual work + 'seapower\-digital\.com', // highjacked + 'startribune\.newspapers\.com', // Fails 100% + 'stats\.espncricinfo\.com', // Fails 100% + 'steubencourier\.com', // USA Today highjakced + 'swedishcharts\.com', // Fails 100% + 'timea\.rice\.edu', // gone 'tnmaps\.tn\.nic\.in', // maps - 'lincstothepast\.com', - 'elonet\.finna\.fi', // Zotero cannot handle this data-base well at all - 'washingtonpost\.com', // Fails 100% - 'cagematch\.net', // Fails 100% - 'www\.cbc\.ca', // Fails 100% - 'www\.emporis\.com', // Fails 100% + 'twitter\.', // This should be {{cite tweet}} 'usnews\.com', // Fails 100% - 'www\.ascap\.com', // Fails 100% - 'www\.thefutoncritic\.com', // Fails 100% - 'www\.bom\.gov\.au', // Fails 100% - 'www\.dailytelegraph\.com\.au', // Fails 100% - 'www\.pwinsider\.com', // Fails 100% - 'www\.wrecksite\.eu', // Fails 100% - 'www\.heraldsun\.com\.au', // Fails 100% - 'www\.neighbourhood\.statistics\.gov\.uk', // Fails 100% - 'www\.playbill\.com', // Fails 100% - 'olympics\.com', // Fails 100% - 'www\.legislation\.gov\.uk', // Fails 100% - 'stats\.espncricinfo\.com', // Fails 100% + 'washingtonpost\.com', // Fails 100% + 'weblio\.jp/', // just scrapes other website like wiki and has bogus issue and volume numbers + 'worldcat\.org', // Should use parameters and google instead 'www\.almasdarnews\.com', // Fails 100% - 'www\.encyclopedia\.com', // Fails 100% - 'swedishcharts\.com', // Fails 100% - 'www\.teamusa\.org', // Fails 100% + 'www\.ascap\.com', // Fails 100% + 'www\.billboard-japan\.com', // Fails 100% + 'www\.bom\.gov\.au', // Fails 100% + 'www\.cbc\.ca', // Fails 100% + 'www\.censusindia\.gov\.in', // Fails 100% 'www\.couriermail\.com\.au', // Fails 100% + 'www\.dailytelegraph\.com\.au', // Fails 100% + 'www\.emporis\.com', // Fails 100% + 'www\.encyclopedia\.com', // Fails 100% 'www\.faa\.gov', // Fails 100% + 'www\.heraldsun\.com\.au', // Fails 100% 'www\.hockeydb\.com', // Fails 100% - 'info\.hazu\.hr', // Fails 100% - 'australian-charts\.com', // Fails 100% - 'startribune\.newspapers\.com', // Fails 100% + 'www\.legislation\.gov\.uk', // Fails 100% + 'www\.neighbourhood\.statistics\.gov\.uk', // Fails 100% 'www\.olympic\.org', // Fails 100% - 'www\.billboard-japan\.com', // Fails 100% - 'www\.censusindia\.gov\.in', // Fails 100% - 'censusindia\.gov\.in', // Fails 100% - 'charts\.nz', // Fails 100% - 'radiomap\.eu', // Fails 100% + 'www\.playbill\.com', // Fails 100% + 'www\.pwinsider\.com', // Fails 100% + 'www\.teamusa\.org', // Fails 100% + 'www\.thefutoncritic\.com', // Fails 100% 'www\.virtualwall\.org', // Fails 100% - 'steubencourier\.com', // USA Today highjakced - 'biodiversity\.org\.au', // Zotero gives bad data - 'timea\.rice\.edu', // gone - 'findarticles\.com', // Gone - 'myprivacy\.dpgmedia\.nl', // blocker - 'explore\.bl\.uk/primo_library', // Fails 100% - 'weblio\.jp/', // just scrapes other website like wiki and has bogus issue and volume numbers - 'kijkcijferanalyse\.nl/', // highjacked + 'www\.wrecksite\.eu', // Fails 100% ]; const NON_JOURNAL_WEBSITES = [ - 'ourworldindata.org/', '-news.co.uk/', '.ajc.com/', '.al.com/', @@ -8326,6 +8325,7 @@ 'oursportscentral.com/', 'ourvmc.org/', 'ourwarwickshire.org.uk/', + 'ourworldindata.org/', 'out.com/', 'outils-odsef-fss.ulaval.ca/', 'outlawcountrycruise.com/', @@ -12234,117 +12234,117 @@ // bbm.ca is short enough that we add /bbm.ca/ and .bbm.ca/ since we do not want to grab too many sites const NON_JOURNAL_DOIS = [ - '10.17487/rfc', - '10.5531/db.vz.0001', '10.1163/2352-0248', + '10.17487/rfc', + '10.2139/ssrn', '10.3318/dib', '10.34667/tind.', - '10.2139/ssrn', + '10.5531/db.vz.0001', ]; // lowercase exact matches const NON_JOURNALS = [ 'Amphibian Species of the World', 'an Online Reference', 'An Online Reference', 'Boston Almanac and Guide1', - 'Includes:reports from Commissioners, Inspectors and Others', 'Digital Collections', + 'Includes:reports from Commissioners, Inspectors and Others', 'X (Formerly Twitter)', ]; // Case-sensitive sub-string const ARE_MAGAZINES = [ - 'the new yorker', - 'the new republic', - 'new republic', - 'expedition magazine', - 'wired', - 'wired uk', + 'billboard (magazine)', + 'billboard', 'computer gaming world', 'edge (magazine)', + 'electronic gaming monthly', + 'entertainment weekly', + 'expedition magazine', + 'famitsu', 'game informer', - 'pc gamer uk', - 'wired (magazine)', - 'time', - 'life', - 'time (magazine)', + 'gamepro', + 'games radar', + 'harper\'s bazaar', + 'harper\'s magazine', + 'harper\'s', + 'hyper', + 'kalki', 'life (magazine)', - 'billboard', - 'billboard (magazine)', - 'rolling stone', + 'life', 'mcv/develop', - 'vanity fair', + 'new civil engineer', + 'new republic', 'nintendo power', - 'playthings', - 'entertainment weekly', - 'official xbox magazine', - 'electronic gaming monthly', 'official u.s. playstation magazine', - 'playstation: the official magazine', + 'official xbox magazine', + 'pc gamer uk', 'play and silicon mag', - 'games radar', - 'hyper', - 'famitsu', - 'gamepro', - 'yachting world', - 'kalki', + 'playstation: the official magazine', + 'playthings', + 'rolling stone', 'sports illustrated', - 'new civil engineer', - 'harper\'s magazine', - 'harper\'s bazaar', - 'harper\'s', + 'the new republic', + 'the new yorker', + 'time (magazine)', + 'time', + 'vanity fair', + 'wired (magazine)', + 'wired uk', + 'wired', + 'yachting world', ]; // lowercase axact matches const ARE_MANY_THINGS = [ - 'pc gamer', - 'gamestar', - 'rock paper shotgun', - 'mcv', - 'rock, paper, shotgun', - 'edge', - 'ballotpedia', - 'npr', + 'audible.com', + 'audible', 'ballotpedia.org', - 'npr.org', - 'nih.gov', - 'nih', - 'eurogamer.it', + 'ballotpedia', + 'brema.suub.uni-bremen.de', 'conceptcarz', - 'the royal family', - 'eurogamer.de', 'east west main line partnership', - 'national institutes of health', - 'national institutes of health (nih)', - 'www.finna.fi', - 'finna.fi', + 'edge', 'elonet', - 'audible.com', - 'audible', + 'eurogamer.de', + 'eurogamer.it', + 'finna.fi', + 'gamestar', + 'mcv', + 'national institutes of health (nih)', + 'national institutes of health', + 'nih.gov', + 'nih', + 'npr.org', + 'npr', + 'pc gamer', + 'rock paper shotgun', + 'rock, paper, shotgun', 'staats und universitätsbibliothek bremen', - 'brema.suub.uni-bremen.de', + 'the royal family', 'usdakotawar.org', + 'www.finna.fi', ]; // lowercase axact matches. These are things that are both websites and newspapers const ARE_NEWSPAPERS = [ - 'the economist', 'la times', - 'toronto sun', - 'washington post', - 'the washington post', 'philippine daily inquirer', + 'the economist', 'the irish times', - 'wikinews', + 'the washington post', + 'toronto sun', + 'washington post', 'wikinews.org', + 'wikinews', ]; // lowercase axact matches const NO_PUBLISHER_NEEDED = [ + 'forbes magazine', + 'forbes.com', + 'huffington post', 'los angeles times', 'new york times magazine', - 'the new york times', 'new york times', - 'huffington post', 'the daily telegraph', - 'forbes.com', - 'forbes magazine', + 'the new york times', ]; // lowercase axact matches const ENCYCLOPEDIA_WEB = [ - 'plato.stanford.edu', 'britannica.com', + 'plato.stanford.edu', ]; const GOOD_10_1093_DOIS = [ @@ -12799,40 +12799,40 @@ // List of things to not print links to, since they occur all the time const AVOIDED_LINKS = [ '', - 'Digital_object_identifier', - 'JSTOR', - 'Website', - 'International_Standard_Book_Number', - 'Library_of_Congress_Control_Number', - 'Handle_System', - 'PubMed_Central', - 'PubMed', - 'PubMed_Identifier', + 'ArXiv_(identifier)', + 'ArXiv', + 'Bibcode_(identifier)', 'Bibcode', - 'International_Standard_Serial_Number', 'bioRxiv', + 'Cf.', 'CiteSeerX', - 'Zentralblatt_MATH', + 'Digital_object_identifier', + 'Doi_(identifier)', + 'Handle_System', + 'International_Standard_Book_Number', + 'International_Standard_Serial_Number', + 'ISBN_(identifier)', + 'ISSN_(identifier)', 'Jahrbuch_über_die_Fortschritte_der_Mathematik', + 'JSTOR', + 'Library_of_Congress_Control_Number', 'Mathematical_Reviews', + 'OCLC_(identifier)', + 'OCLC', 'Office_of_Scientific_and_Technical_Information', - 'Request_for_Comments', - 'Social_Science_Research_Network', - 'Zentralblatt_MATH', 'Open_Library', - 'ArXiv', - 'OCLC', - 'Cf.', - 'Doi_(identifier)', + 'OSTI_(identifier)', 'PMC_(identifier)', 'PMID_(identifier)', - 'ArXiv_(identifier)', - 'Bibcode_(identifier)', + 'PubMed_Central', + 'PubMed_Identifier', + 'PubMed', + 'Request_for_Comments', 'S2CID_(identifier)', - 'ISBN_(identifier)', - 'ISSN_(identifier)', - 'OCLC_(identifier)', - 'OSTI_(identifier)', + 'Social_Science_Research_Network', + 'Website', + 'Zentralblatt_MATH', + 'Zentralblatt_MATH', ]; // Lower case, and periods and dashes converted to spaces @@ -12899,28 +12899,28 @@ const COMPARE_SERIES_OUT = [' ', ' ', ' ', ' ', ' ', ' ', ' ', '', '', 'adv ', 'exp', 'pharmacol', 'meth ', 'immunol', 'meth ', 'mol', 'med', 'bio', ' ', 'enzymol', 'bio', ' ', ' ', 'embryol', 'anat']; const ALWAYS_BAD_TITLES = [ + '{title}', 'Bloomberg - Are you a robot?', - 'Page not found', - 'Breaking News, Analysis, Politics, Blogs, News Photos, Video, Tech Reviews', 'Breaking News, Analysis, Politics, Blogs, News Photos, Video, Tech Reviews - TIME.com', - 'Register | British Newspaper Archive', + 'Breaking News, Analysis, Politics, Blogs, News Photos, Video, Tech Reviews', + 'DPG Media Privacy Gate', + 'How to access research remotely', + 'Log In - ProQuest', + 'Page not found', + 'PressReader.com - Connecting People Through News.', + 'PressReader.com - Connecting People Through News', + 'PressReader.com - Digital Newspaper & Magazine Subscriptions.', + 'PressReader.com - Digital Newspaper & Magazine Subscriptions', 'PressReader.com - Your favorite newspapers and magazines.', 'PressReader.com - Your favorite newspapers and magazines', - 'PressReader.com - Connecting People Through News', - 'PressReader.com - Connecting People Through News.', - 'PressReader.com – Your favorite newspapers and magazines.', - 'PressReader.com – Your favorite newspapers and magazines', - 'PressReader.com – Connecting People Through News', 'PressReader.com – Connecting People Through News.', - 'PressReader.com - Digital Newspaper & Magazine Subscriptions', - 'PressReader.com - Digital Newspaper & Magazine Subscriptions.', + 'PressReader.com – Connecting People Through News', 'PressReader.com – Digital Newspaper & Magazine Subscriptions', - 'How to access research remotely', - 'Log In - ProQuest', - 'DPG Media Privacy Gate', - 'Request Rejected', - '{title}', + 'PressReader.com – Your favorite newspapers and magazines.', + 'PressReader.com – Your favorite newspapers and magazines', 'Radware Bot Manager Captcha', + 'Register | British Newspaper Archive', + 'Request Rejected', ]; const DOI_FREE_PREFIX = [ @@ -13091,11 +13091,11 @@ ]; const BAD_DOI_ARRAY = [ - '10.1126/science' => true, '' => true, - '10.7556/jaoa' => true, - '10.1267/science.040579197' => true, + '10.0000/Rubbish_bot_failure_test.x' => true, '10.0000/Rubbish_bot_failure_test' => true, '10.0000/Rubbish_bot_failure_test2' => true, - '10.0000/Rubbish_bot_failure_test.x' => true, + '10.1126/science' => true, + '10.1267/science.040579197' => true, + '10.7556/jaoa' => true, ];