diff --git a/Template.php b/Template.php
index f5cad6a8ae..2fe9d91492 100644
--- a/Template.php
+++ b/Template.php
@@ -15,6 +15,125 @@ require_once 'NameTools.php';
 // @codeCoverageIgnoreEnd
 
 
+
+// Lookup lists used by the Template methods below. Every use is a strict
+// in_array() test, so entries are matched exactly.
+const REJECT_NEW = ['null', 'n/a', 'undefined', '0 0', '(:none)', '-']; // lower-cased values add_if_new() refuses
+const GOOFY_TITLES = ['Archived copy', "{title}", 'ScienceDirect', 'Google Books', 'None', 'usurped title']; // placeholder titles
+const BAD_NEW_PAGES = ['0', '0-0', '0–0']; // bogus zero page numbers
+const BAD_ISBN = ['9780918678072', '978-0-918678-07-2', '0918678072', '0-918678-07-2']; // add_if_new() refuses these
+const SHORT_STRING = ['the', 'and', 'a', 'for', 'in', 'on', 's', 're', 't', 'an', 'as', 'at', 'and', 'but', 'how', 'why', 'by', 'when', 'with', 'who', 'where', '']; // words do_pumbed_query() leaves out
+const RIS_IS_BOOK = ['CHAP', 'BOOK', 'EBOOK', 'ECHAP', 'EDBOOK', 'DICT', 'ENCYC', 'GOVDOC']; // RIS "TY" values that set $ris_book
+const RIS_IS_FULL_BOOK = ['BOOK', 'EBOOK', 'EDBOOK']; // RIS "TY" values that set $ris_fullbook
+
+// get_unpaywall_url() results after which get_open_access_url() returns early
+const GOOD_FREE = ['publisher', 'projectmuse', 'have free'];
+
+// DOIs for which get_unpaywall_url() returns 'wrong'
+const BAD_OA_URL = ['10.4135/9781529742343', '10.1017/9781108859745'];
+
+// WIKI_BASE values for which clean_google_books() does its work
+const ENGLISH_WIKI = ['en', 'simple', 'mdwiki'];
+
+// Parameters from which tidy_parameter() strips a final semicolon
+const REMOVE_SEMI = ['date', 'year', 'location', 'publisher', 'issue', 'number', 'page', 'pages', 'pp', 'p', 'volume'];
+
+// Parameters from which tidy_parameter() strips a final period after digits
+const REMOVE_PERIOD = ['date', 'year', 'issue', 'number', 'page', 'pages', 'pp', 'p', 'volume'];
+
+// *link parameter names, compared after lower-casing and removing digits and hyphens
+const LINK_LIST = ['authorlink', 'chapterlink', 'contributorlink', 'editorlink', 'episodelink', 'interviewerlink', 'inventorlink', 'serieslink', 'subjectlink', 'titlelink', 'translatorlink'];
+
+// agency= values that tidy_parameter() renames to publisher= ...
+const BAD_AGENT = ['United States Food and Drug Administration', 'Surgeon General of the United States', 'California Department of Public Health'];
+
+// ... but only when publisher= is one of these (blank included)
+const BAD_AGENT_PUBS = ['United States Department of Health and Human Services', 'California Tobacco Control Program', ''];
+
+// Lower-cased "no" answers; live/alive are here because a live URL is "not dead"
+const NO_LANGS = ['n', 'no', 'live', 'alive', 'কার্যকর', 'hayır', 'não', 'nao', 'false'];
+
+// Lower-cased "yes" answers; dead is here because a dead URL is "yes, dead"
+const YES_LANGS = ['y', 'yes', 'dead', 'si', 'sì', 'ja', 'evet', 'ei tööta', 'sim', 'ano', 'true'];
+
+// Lower-cased format values that only say "PDF"
+const PDF_LINKS = ['pdf', 'portable document format', '[[portable document format|pdf]]', '[[portable document format]]', '[[pdf]]'];
+
+// Lower-cased work= values that are really newspaper sections;
+// tidy_parameter() renames work= to department= when department= is blank.
+const DEPARTMENTS = [
+    'local',
+    'editorial',
+    'international',
+    'national',
+    'communication',
+    'letter to the editor',
+    'review',
+    'coronavirus',
+    'race & reckoning',
+    'politics',
+    'opinion',
+    'opinions',
+    'investigations',
+    'tech',
+    'technology',
+    'world',
+    'sports',
+    'world',
+    'arts & entertainment',
+    'arts',
+    'entertainment',
+    'u.s.',
+    'n.y.',
+    'business',
+    'science',
+    'health',
+    'books',
+    'style',
+    'food',
+    'travel',
+    'real estate',
+    'magazine',
+    'economy',
+    'markets',
+    'life & arts',
+    'uk news',
+    'world news',
+    'health news',
+    'lifestyle',
+    'photos',
+    'education',
+    'arts',
+    'life',
+    'puzzles',
+];
+
+// via= values (lower-cased, square brackets removed) for which
+// tidy_parameter() forgets the parameter.
+const BAD_VIA = [
+    '',
+    'project muse',
+    'wiley',
+    'springer',
+    'questia',
+    'elsevier',
+    'wiley online library',
+    'wiley interscience',
+    'interscience',
+    'sciencedirect',
+    'science direct',
+    'ebscohost',
+    'proquest',
+    'google scholar',
+    'google',
+    'bing',
+    'yahoo',
+];
+
+// Parameter names checked by modifications() and accepted by
+// volume_issue_demix().
+const VOL_NUM = ['volume', 'issue', 'number'];
+
 final class Template
 {
     public const PLACEHOLDER_TEXT = '# # # CITATION_BOT_PLACEHOLDER_TEMPLATE %s # # #';
@@ -1164,7 +1283,7 @@ public function add_if_new(string $param_name, string $value, string $api = ''):
         }
 
         $low_value = strtolower($value);
-        if (in_array($low_value, ['null', 'n/a', 'undefined', '0 0', '(:none)', '-'], true)) {
+        if (in_array($low_value, REJECT_NEW, true)) {
             // Hopeully name is not actually null
             return false;
         }
@@ -1963,7 +2082,7 @@ public function add_if_new(string $param_name, string $value, string $api = ''):
                 }
                 if (
                     $this->blank($param_name) ||
-                    in_array($this->get($param_name), ['Archived copy', "{title}", 'ScienceDirect', 'Google Books', 'None', 'usurped title'], true) ||
+                    in_array($this->get($param_name), GOOFY_TITLES, true) ||
                     (stripos($this->get($param_name), 'EZProxy') !== false && stripos($value, 'EZProxy') === false)
                 ) {
                     foreach (['encyclopedia', 'encyclopaedia', 'work', 'dictionary', 'journal'] as $worky) {
@@ -2088,9 +2207,9 @@ public function add_if_new(string $param_name, string $value, string $api = ''):
 
             case "page":
             case "pages":
-                if (in_array($value, ['0', '0-0', '0–0'], true)) {
+                if (in_array($value, BAD_NEW_PAGES, true)) {
                     return false;
                 } // Reject bogus zero page number
                 if ($this->has('at') || $this->has('article-number')) {
                     return false;
                 } // Leave at= alone. People often use that for at=See figure 17 on page......
@@ -2443,9 +2562,9 @@ public function add_if_new(string $param_name, string $value, string $api = ''):
                 return false;
 
             case 'isbn':
-                if (in_array($value, ['9780918678072', '978-0-918678-07-2', '0918678072', '0-918678-07-2'], true)) {
+                if (in_array($value, BAD_ISBN, true)) {
                     return false;
                 } // Not a good one
                 if ($this->blank($param_name)) {
                     $value = $this->isbn10Toisbn13($value);
                     if (strlen($value) === 13 && substr($value, 0, 6) === '978019') {
@@ -2953,7 +3072,7 @@ private function do_pumbed_query(array $terms): array
                 $data = strip_diacritics($data);
                 $data_array = explode(" ", $data);
                 foreach ($data_array as $val) {
-                    if (!in_array(strtolower($val), ['the', 'and', 'a', 'for', 'in', 'on', 's', 're', 't', 'an', 'as', 'at', 'and', 'but', 'how', 'why', 'by', 'when', 'with', 'who', 'where', ''], true) && mb_strlen($val) > 3) {
+                    if (!in_array(strtolower($val), SHORT_STRING, true) && mb_strlen($val) > 3) {
                         // Small words are NOT indexed
                         $query .= " AND (" . str_replace("%E2%80%93", "-", urlencode($val)) . "[{$key}])";
                     }
@@ -3200,7 +3319,7 @@ public function expand_by_adsabs(): void
             report_info("Database entry not complete"); // @codeCoverageIgnore
             return; // @codeCoverageIgnore
         }
-        if ($this->has('title') && titles_are_dissimilar($this->get('title'), $record->title[0]) && !in_array($this->get('title'), ['Archived copy', "{title}", 'ScienceDirect', "Google Books", "None", 'usurped title'], true)) {
+        if ($this->has('title') && titles_are_dissimilar($this->get('title'), $record->title[0]) && !in_array($this->get('title'), GOOFY_TITLES, true)) {
             // Verify the title matches. We get some strange mis-matches {
             report_info("Similar title not found in database"); // @codeCoverageIgnore
             return; // @codeCoverageIgnore
@@ -3323,10 +3442,10 @@ public function expand_by_RIS(string &$dat, bool $add_url): void
                 $ris_part[0] = "";
             } // Ignore
             if (trim($ris_part[0]) === "TY") {
-                if (in_array(trim($ris_part[1]), ['CHAP', 'BOOK', 'EBOOK', 'ECHAP', 'EDBOOK', 'DICT', 'ENCYC', 'GOVDOC'], true)) {
+                if (in_array(trim($ris_part[1]), RIS_IS_BOOK, true)) {
                     $ris_book = true; // See https://en.wikipedia.org/wiki/RIS_(file_format)#Type_of_reference
                 }
-                if (in_array(trim($ris_part[1]), ['BOOK', 'EBOOK', 'EDBOOK'], true)) {
+                if (in_array(trim($ris_part[1]), RIS_IS_FULL_BOOK, true)) {
                     $ris_fullbook = true;
                 }
             } elseif (trim($ris_part[0]) === "T2") {
@@ -3525,7 +3644,7 @@ public function get_open_access_url(): void
             return;
         }
         $return = $this->get_unpaywall_url($doi);
-        if (in_array($return, ['publisher', 'projectmuse', 'have free'], true)) {
+        if (in_array($return, GOOD_FREE, true)) {
             return;
         } // Do continue on
         $this->get_semanticscholar_url($doi);
@@ -3564,7 +3683,7 @@ public function get_unpaywall_url(string $doi): string
         if ($ch_oa === null) {
             $ch_oa = bot_curl_init(0.5, [CURLOPT_USERAGENT => BOT_CROSSREF_USER_AGENT]);
         }
-        if (in_array($doi, ['10.4135/9781529742343', '10.1017/9781108859745'], true)) {
+        if (in_array($doi, BAD_OA_URL, true)) {
             return 'wrong';
         } // TODO - maybe all ISBN
         set_time_limit(120);
@@ -3771,7 +3890,7 @@ public function get_unpaywall_url(string $doi): string
 
     public function clean_google_books(): void
     {
-        if (!in_array(WIKI_BASE, ['en', 'simple', 'mdwiki'], true)) { // TODO - support other countries
+        if (!in_array(WIKI_BASE, ENGLISH_WIKI, true)) { // TODO - support other countries
             return;
         }
         foreach (ALL_URL_TYPES as $url_type) {
@@ -5127,11 +5246,11 @@ public function tidy_parameter(string $param): void
         $this->set($param, safe_preg_replace('~(?get($param))); // &Amp; => & but not if next character is & or previous character is ;
 
         // Remove final semi-colon from a few items
-        if ((in_array($param, ['date', 'year', 'location', 'publisher', 'issue', 'number', 'page', 'pages', 'pp', 'p', 'volume'], true) || in_array($param, FLATTENED_AUTHOR_PARAMETERS, true)) && strpos($this->get($param), '&') === false) {
+        if ((in_array($param, REMOVE_SEMI, true) || in_array($param, FLATTENED_AUTHOR_PARAMETERS, true)) && strpos($this->get($param), '&') === false) {
             $this->set($param, safe_preg_replace('~;$~u', '', $this->get($param)));
         }
         // Remove final period from a few items
-        if (in_array($param, ['date', 'year', 'issue', 'number', 'page', 'pages', 'pp', 'p', 'volume'], true)) {
+        if (in_array($param, REMOVE_PERIOD, true)) {
             if (preg_match('~^(\d+)\.$~', $this->get($param), $match)) {
                 $this->set($param, $match[1]);
             }
@@ -5174,7 +5293,7 @@ public function tidy_parameter(string $param): void
         if (
             in_array(
                 str_replace(['-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], '', strtolower($param)),
-                ['authorlink', 'chapterlink', 'contributorlink', 'editorlink', 'episodelink', 'interviewerlink', 'inventorlink', 'serieslink', 'subjectlink', 'titlelink', 'translatorlink'],
+                LINK_LIST,
                 true
             ) &&
             $this->has($param) &&
@@ -5211,8 +5330,8 @@ public function tidy_parameter(string $param): void
 
             case 'agency':
                 if (
-                    in_array($this->get('agency'), ['United States Food and Drug Administration', 'Surgeon General of the United States', 'California Department of Public Health'], true) &&
-                    in_array($this->get('publisher'), ['United States Department of Health and Human Services', 'California Tobacco Control Program', ''], true)
+                    in_array($this->get('agency'), BAD_AGENT, true) &&
+                    in_array($this->get('publisher'), BAD_AGENT_PUBS, true)
                 ) {
                     $this->forget('publisher');
                     $this->rename('agency', 'publisher'); // A single user messed this up on a lot of pages with "agency"
@@ -5499,10 +5618,10 @@ public function tidy_parameter(string $param): void
             case 'dead-url':
             case 'deadurl':
                 $the_data = mb_strtolower($this->get($param));
-                if (in_array($the_data, ['y', 'yes', 'dead', 'si', 'sì', 'ja', 'evet', 'ei tööta', 'sim', 'ano'], true)) {
+                if (in_array($the_data, YES_LANGS, true)) {
                     $this->rename($param, 'url-status', 'dead');
                     $this->forget($param);
-                } elseif (in_array($the_data, ['n', 'no', 'live', 'alive', 'কার্যকর', 'hayır', 'não', 'nao'], true)) {
+                } elseif (in_array($the_data, NO_LANGS, true)) {
                     $this->rename($param, 'url-status', 'live');
                     $this->forget($param);
                 } elseif (in_array($the_data, ['', 'bot: unknown'], true)) {
@@ -5530,9 +5649,9 @@ public function tidy_parameter(string $param): void
 
             case 'url-status':
                 $the_data = mb_strtolower($this->get($param));
-                if (in_array($the_data, ['y', 'yes', 'si', 'sì', 'ei tööta'], true)) {
+                if (in_array($the_data, YES_LANGS, true)) {
                     $this->set($param, 'dead');
-                } elseif (in_array($the_data, ['n', 'no', 'alive', 'কার্যকর'], true)) {
+                } elseif (in_array($the_data, NO_LANGS, true)) {
                     $this->set($param, 'live');
                 }
                 return;
@@ -5546,11 +5665,11 @@ public function tidy_parameter(string $param): void
             case 'last-author-amp':
             case 'lastauthoramp':
                 $the_data = mb_strtolower($this->get($param));
-                if (in_array($the_data, ['n', 'no', 'false'], true)) {
+                if (in_array($the_data, NO_LANGS, true)) {
                     $this->forget($param);
                     return;
                 }
-                if (in_array($the_data, ['y', 'yes', 'true'], true)) {
+                if (in_array($the_data, YES_LANGS, true)) {
                     $this->rename($param, 'name-list-style', 'amp');
                     $this->forget($param);
                 }
@@ -5873,7 +5992,7 @@ public function tidy_parameter(string $param): void
                     $this->forget($param);
                 }
                 // Citation templates do this automatically -- also remove if there is no url
-                if (in_array(strtolower($this->get($param)), ['pdf', 'portable document format', '[[portable document format|pdf]]', '[[portable document format]]', '[[pdf]]'], true)) {
+                if (in_array(strtolower($this->get($param)), PDF_LINKS, true)) {
                     if ($this->blank('url') || strtolower(substr($this->get('url'), -4)) === '.pdf') {
                         $this->forget($param);
                     }
@@ -5886,7 +6005,7 @@ public function tidy_parameter(string $param): void
                     $this->forget($param);
                 }
                 // Citation templates do this automatically -- also remove if there is no url, which is template error
-                if (in_array(strtolower($this->get($param)), ['pdf', 'portable document format', '[[portable document format|pdf]]', '[[portable document format]]'], true)) {
+                if (in_array(strtolower($this->get($param)), PDF_LINKS, true)) {
                     if ($this->has('chapter-url')) {
                         if (substr($this->get('chapter-url'), -4) === '.pdf' || substr($this->get('chapter-url'), -4) === '.PDF') {
                             $this->forget($param);
@@ -6527,56 +6646,7 @@ public function tidy_parameter(string $param): void
                     }
                 }
                 if (
-                    in_array(
-                        strtolower($this->get('work')),
-                        [
-                            'local',
-                            'editorial',
-                            'international',
-                            'national',
-                            'communication',
-                            'letter to the editor',
-                            'review',
-                            'coronavirus',
-                            'race & reckoning',
-                            'politics',
-                            'opinion',
-                            'opinions',
-                            'investigations',
-                            'tech',
-                            'technology',
-                            'world',
-                            'sports',
-                            'world',
-                            'arts & entertainment',
-                            'arts',
-                            'entertainment',
-                            'u.s.',
-                            'n.y.',
-                            'business',
-                            'science',
-                            'health',
-                            'books',
-                            'style',
-                            'food',
-                            'travel',
-                            'real estate',
-                            'magazine',
-                            'economy',
-                            'markets',
-                            'life & arts',
-                            'uk news',
-                            'world news',
-                            'health news',
-                            'lifestyle',
-                            'photos',
-                            'education',
-                            'arts',
-                            'life',
-                            'puzzles',
-                        ],
-                        true
-                    ) &&
+                    in_array(strtolower($this->get('work')), DEPARTMENTS, true) &&
                     $this->blank('department')
                 ) {
                     $this->rename('work', 'department');
@@ -7466,29 +7536,7 @@ public function tidy_parameter(string $param): void
                 ) {
                     $via = trim(str_replace(['[', ']'], '', strtolower($this->get('via'))));
                     if (
-                        in_array(
-                            $via,
-                            [
-                                '',
-                                'project muse',
-                                'wiley',
-                                'springer',
-                                'questia',
-                                'elsevier',
-                                'wiley online library',
-                                'wiley interscience',
-                                'interscience',
-                                'sciencedirect',
-                                'science direct',
-                                'ebscohost',
-                                'proquest',
-                                'google scholar',
-                                'google',
-                                'bing',
-                                'yahoo',
-                            ],
-                            true
-                        )
+                        in_array($via, BAD_VIA, true)
                     ) {
                         $this->forget('via');
                         return;
@@ -9008,7 +9056,7 @@ public function modifications(): array
                         $no_dash_to_start = false;
                     }
                 }
-                if (in_array($old_name, ['volume', 'issue', 'number'], true)) {
+                if (in_array($old_name, VOL_NUM, true)) {
                     if (strpos($old_data, '-') !== false) {
                         $no_dash_to_start = false;
                     }
@@ -9101,7 +9149,7 @@ private function volume_issue_demix(string $data, string $param): void
         if ($param === 'year') {
             return;
         }
-        if (!in_array($param, ['volume', 'issue', 'number'], true)) {
+        if (!in_array($param, VOL_NUM, true)) {
             report_error('volume_issue_demix ' . echoable($param)); // @codeCoverageIgnore
         }
         if (in_array($this->wikiname(), ['cite encyclopaedia', 'cite encyclopedia', 'cite book'], true)) {