Skip to content

Commit

Permalink
Merge branch 'master' into PHP-8
Browse files: browse the repository at this point in the history
  • Loading branch information
GlazerMann authored Jul 27, 2023
2 parents f99fb5b + 43478c9 commit de6809b
Show file tree
Hide file tree
Showing 23 changed files with 570 additions and 256 deletions.
3 changes: 2 additions & 1 deletion Page.php
Original file line number Diff line number Diff line change
Expand Up @@ -726,13 +726,14 @@ public function extract_object(string $class) : array {
$this->page_error = TRUE;
report_warning('Regular expression failure in ' . echoable($this->title) . ' when extracting ' . $class . 's');
if ($class === "Template") {
echo "<p>\n\n The following text might help you figure out where the <b>error on the page</b> is (Look for lone { and } characters)</h1>\n\n" . echoable($text) . "\n\n<p>";
echo "<p><h3>\n\n The following text might help you figure out where the <b>error on the page</b> is (Look for lone { and } characters)</h3>\n<h4> If that is not the problem, then run the single page with &prce=1 added to the URL to change the parsing engine</h4>\n" . echoable($text) . "\n\n<p>";
}
if (TRAVIS) {
report_error("Critical Error on page: " . $this->title);
} else {
report_warning("Either page is too big and complex or there is an error with { and } characters balancing out.");
}
gc_collect_cycles();
// @codeCoverageIgnoreEnd
}
$this->text = $text;
Expand Down
90 changes: 78 additions & 12 deletions Template.php
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,7 @@ public function prepare() : void {
}
if (stripos($the_journal, 'Advances in Cryptology') === 0 ||
stripos($the_journal, 'IEEE Symposium') !== FALSE ||
stripos($the_journal, 'IEEE Conference') !== FALSE ||
stripos($the_journal, 'IEEE International Conference') !== FALSE ) {
$this->rename('journal', 'CITATION_BOT_PLACEHOLDER_journal');
$the_journal = '';
Expand All @@ -365,22 +366,54 @@ public function prepare() : void {
$the_chapter = '';
}
}
if ($the_pages === '0' || $the_pages === 'null' || $the_pages === 'n/a' || $the_pages === 'online' || $the_pages === 'Online' || $the_pages === 'Forthcoming' || $the_pages === 'forthcoming') {
if (strpos($this->get('doi'), '10.1109/') === 0 && $this->has('isbn')) { // IEEE "book"
$data_to_check = $the_title . $the_journal . $the_chapter . $this->get('series');
if (stripos($data_to_check, 'IEEE Standard for') === FALSE && $this->blank('journal')) {

[Codecov annotation (codecov/patch) — check warning on line 371 in Template.php (Template.php#L371): added line #L371 was not covered by tests. View the check run for this annotation.]
; // Do nothing
} elseif (stripos($data_to_check, 'Symposium') === FALSE && stripos($data_to_check, 'Conference') === FALSE) { // Looks like conference

[Codecov annotation (codecov/patch) — check warning on line 373 in Template.php (Template.php#L373): added line #L373 was not covered by tests. View the check run for this annotation.]
if ($the_journal !== '') {
$this->rename('journal', 'CITATION_BOT_PLACEHOLDER_journal');
$the_journal = '';
}
if ($the_title !== '') {
$this->rename('title', 'CITATION_BOT_PLACEHOLDER_title');
$the_title = '';
}
if ($the_chapter !== '') {
$this->rename('chapter', 'CITATION_BOT_PLACEHOLDER_chapter');
$the_chapter = '';
}
$bad_data = TRUE;
} elseif (stripos($the_journal, 'Symposium') !== FALSE || stripos($the_journal, 'Conference') !== FALSE) {
$this->rename('journal', 'CITATION_BOT_PLACEHOLDER_journal');
$the_journal = '';
$bad_data = TRUE;
if ($the_title !== '') {
$this->rename('title', 'CITATION_BOT_PLACEHOLDER_title');
$the_title = '';
}
if ($the_chapter !== '') {
$this->rename('chapter', 'CITATION_BOT_PLACEHOLDER_chapter');
$the_chapter = '';
}
}
}
if ($the_pages === '_' || $the_pages === '0' || $the_pages === 'null' || $the_pages === 'n/a' || $the_pages === 'online' || $the_pages === 'Online' || $the_pages === 'Forthcoming' || $the_pages === 'forthcoming') {
$this->rename('pages', 'CITATION_BOT_PLACEHOLDER_pages');
$the_pages = '';
$bad_data = TRUE;
}
if ($the_page === '0' || $the_page === 'null' || $the_page === 'n/a' || $the_page === 'online' || $the_page === 'Online' || $the_page === 'Forthcoming' || $the_page === 'forthcoming') {
if ($the_page === '_' || $the_page === '0' || $the_page === 'null' || $the_page === 'n/a' || $the_page === 'online' || $the_page === 'Online' || $the_page === 'Forthcoming' || $the_page === 'forthcoming') {
$this->rename('page', 'CITATION_BOT_PLACEHOLDER_page');
$the_page = '';
$bad_data = TRUE;
}
if ($the_volume === '0' || $the_volume === 'null' || $the_volume === 'n/a' || $the_volume === 'Online edition' || $the_volume === 'online' || $the_volume === 'Online' || $the_volume === 'in press' || $the_volume === 'In press' || $the_volume === 'ahead-of-print' || $the_volume === 'Forthcoming' || $the_volume === 'forthcoming') {
if ($the_volume === '_' || $the_volume === '0' || $the_volume === 'null' || $the_volume === 'n/a' || $the_volume === 'Online edition' || $the_volume === 'online' || $the_volume === 'Online' || $the_volume === 'in press' || $the_volume === 'In press' || $the_volume === 'ahead-of-print' || $the_volume === 'Forthcoming' || $the_volume === 'forthcoming') {
$this->rename('volume', 'CITATION_BOT_PLACEHOLDER_volume');
$the_volume = '';
$bad_data = TRUE;
}
if ($the_issue === '0' || $the_issue === 'null' || $the_issue === 'ja' || $the_issue === 'n/a' || $the_issue === 'Online edition' || $the_issue === 'online' || $the_issue === 'Online' || $the_issue === 'in press' || $the_issue === 'In press' || $the_issue === 'ahead-of-print' || $the_issue === 'Forthcoming' || $the_issue === 'forthcoming') {
if ($the_issue === '_' || $the_issue === '0' || $the_issue === 'null' || $the_issue === 'ja' || $the_issue === 'n/a' || $the_issue === 'Online edition' || $the_issue === 'online' || $the_issue === 'Online' || $the_issue === 'in press' || $the_issue === 'In press' || $the_issue === 'ahead-of-print' || $the_issue === 'Forthcoming' || $the_issue === 'forthcoming') {
$this->rename('issue', 'CITATION_BOT_PLACEHOLDER_issue');
$the_issue = '';
$bad_data = TRUE;
Expand All @@ -397,9 +430,9 @@ public function prepare() : void {
$the_title = '';
$bad_data = TRUE;
}
if ($the_title === 'null' || $the_title === '[No title found]' || $the_title === 'Archived copy' || $the_title === 'JSTOR' ||
if ($the_title === '_' || $the_title === 'null' || $the_title === '[No title found]' || $the_title === 'Archived copy' || $the_title === 'JSTOR' ||
$the_title === 'ShieldSquare Captcha' || $the_title === 'Shibboleth Authentication Request' || $the_title === 'Pubmed' ||
$the_title === 'Pubmed Central') { // title=none is often because title is "reviewed work....
$the_title === 'Pubmed Central' || $the_title === 'Optica Publishing Group') { // title=none is often because title is "reviewed work....
$this->rename('title', 'CITATION_BOT_PLACEHOLDER_title');
$the_title = '';
$bad_data = TRUE;
Expand All @@ -423,7 +456,7 @@ public function prepare() : void {
$the_journal = '';
$bad_data = TRUE;
}
if (str_i_same($the_journal, 'JSTOR')) {
if (str_i_same($the_journal, 'JSTOR') || $the_journal === '_') {
$this->rename('journal', 'CITATION_BOT_PLACEHOLDER_journal');
$the_journal = '';
$bad_data = TRUE;
Expand All @@ -444,7 +477,12 @@ public function prepare() : void {
$this->rename('journal', 'CITATION_BOT_PLACEHOLDER_journal');
$the_journal = '';
$bad_data = TRUE;
}
}
if ($the_chapter === '_') {
$this->rename('chapter', 'CITATION_BOT_PLACEHOLDER_chapter');
$the_chapter = '';
$bad_data = TRUE;
}
if ($the_title !== '' && stripos($the_title, 'CITATION') === FALSE) {
if (str_i_same($the_title, $the_journal) &&
str_i_same($the_title, $the_chapter)) { // Journal === Title === Chapter INSANE! Never actually seen
Expand Down Expand Up @@ -1816,7 +1854,7 @@ public function get_doi_from_crossref() : bool {
return FALSE;
}
// They already allow some fuzziness in matches
if ($data['journal'] || $data['issn']) {
if (($data['journal'] || $data['issn']) && ($data['start_page'] || $data['author'])) {
$url = "https://www.crossref.org/openurl/?noredirect=TRUE&pid=" . CROSSREFUSERNAME
. ($data['title'] ? "&atitle=" . urlencode($data['title']) : '')
. ($data['author'] ? "&aulast=" . urlencode($data['author']) : '')
Expand All @@ -1836,7 +1874,7 @@ public function get_doi_from_crossref() : bool {
}
$result = $result->query_result->body->query;
if ((string) $result->attributes()->status === 'malformed') {
report_warning("Cannot search CrossRef: " . echoable((string) $result->msg)); // @codeCoverageIgnore
report_minor_error("Cannot search CrossRef: " . echoable((string) $result->msg)); // @codeCoverageIgnore
} elseif ((string) $result->attributes()->status === "resolved") {
if (!isset($result->doi)) return FALSE;
report_info(" Successful!");
Expand Down Expand Up @@ -3071,7 +3109,8 @@ protected function use_unnamed_params() : void {
// remove leading spaces or hyphens (which may have been typoed for an equals)
if (preg_match("~^[ -+]*(.+)~", (string) substr($dat, strlen($closest)), $match)) { // Cast FALSE to string
$this->add_if_new($closest, $match[1]/* . " [$shortest / $comp = $shortish]"*/);
$dat = trim(preg_replace('~^.*' . preg_quote($match[1]) . '~', '', $dat));
$replace_pos = strrpos($dat, $match[1]) + strlen($match[1]);
$dat = trim(substr($dat, $replace_pos));
}
} elseif (preg_match("~(?<!\d)(\d{10})(?!\d)~", str_replace(Array(" ", "-"), "", $dat), $match)) {
$the_isbn = str_split($match[1]);
Expand Down Expand Up @@ -4013,7 +4052,7 @@ public function tidy_parameter(string $param) : void {
}
if (!doi_works($doi)) {
if (preg_match('~^10.1093\/oi\/authority\.\d{10,}$~', $doi) &&
preg_match('~(?:oxfordreference\.com|oxfordindex\.oup\.com)\/view\/10.1093\/oi\/authority\.\d{10,}~', $this->get('url'))) {
preg_match('~(?:oxfordreference\.com|oxfordindex\.oup\.com)\/[^\/]+\/10.1093\/oi\/authority\.\d{10,}~', $this->get('url'))) {
$this->forget('doi');
return;
} elseif (preg_match('~^10\.1093\/law\:epil\/9780199231690\/law\-9780199231690~', $doi) &&
Expand Down Expand Up @@ -5787,6 +5826,14 @@ public function tidy_parameter(string $param) : void {
$this->forget('author');
}
}
if ( strtolower($the_param) === 'www.sciencedirect.com' ||
strtolower($the_param) === 'sciencedirect.com' ||
strtolower($the_param) === 'sciencedirect'
) {
if ($this->has('isbn')) {
$this->forget($param);
}
}
return;

case 'location':
Expand Down Expand Up @@ -5856,6 +5903,11 @@ public function final_tidy() : void {
$this->forget('series');
}
}
if ($this->has('journal') && str_equivalent($this->get('title'), $this->get('journal'))) {
if ($this->wikiname() === 'cite book' || $this->has('isbn')) {
$this->forget('journal');
}
}
// Double check these troublesome "journals"
if (($this->is_book_series('journal') || $this->is_book_series('series') ||
$this->is_book_series('chapter') || $this->is_book_series('title')) ||
Expand Down Expand Up @@ -6046,6 +6098,19 @@ public function final_tidy() : void {
}
}
}
if (preg_match('~^10\.1109/~', $this->get('doi')) &&
$this->has('title') && $this->has('chapter') && $this->has('isbn') &&
$this->wikiname() === 'cite book' && doi_works($this->get('doi'))) {
if (stripos($this->get('title'), 'Conference') !== FALSE) {
if (stripos($this->get('website'), 'Conference') !== FALSE) $this->forget('website');
if (stripos($this->get('journal'), 'Conference') !== FALSE) $this->forget('journal');
}
if (stripos($this->get('title'), 'Symposium') !== FALSE) {
if (stripos($this->get('website'), 'Symposium') !== FALSE) $this->forget('website');
if (stripos($this->get('journal'), 'Symposium') !== FALSE) $this->forget('journal');
}
}

$this->tidy_parameter('doi'); // might be free, and freedom is date dependent for some journals
if ($this->blank(PAGE_ALIASES) && preg_match('~^10\.1103\/[a-zA-Z]+\.(\d+)\.(\d+)$~', $this->get('doi'), $matches)) {
if ($matches[1] === $this->get('volume')) {
Expand Down Expand Up @@ -6784,6 +6849,7 @@ protected function volume_issue_demix(string $data, string $param) : void {
preg_match("~^(\d+) \((\d+ Suppl \d+)\)$~i", $data, $matches) ||
preg_match("~^Vol\.?(\d+)\((\d+)\)$~", $data, $matches) ||
preg_match("~^(\d+) +\(No(?:\.|\. | )(\d+)\)$~i", $data, $matches) ||
preg_match("~^(\d+):(\d+)$~", $data, $matches) ||
preg_match("~^(\d+) +\(Iss(?:\.|\. | )(\d+)\)$~i", $data, $matches)
) {
$possible_volume=$matches[1];
Expand Down
10 changes: 8 additions & 2 deletions WikipediaBot.php
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@ static public function GetLastUser() : string {
* @codeCoverageIgnore
*/
private function authenticate_user() : void {
@setcookie(session_name(),session_id(),time()+(7*24*3600)); // 7 days
@setcookie(session_name(),session_id(),time()+(7*24*3600), "", "", TRUE, TRUE); // 7 days
if (isset($_SESSION['citation_bot_user_id']) &&
isset($_SESSION['access_key']) &&
isset($_SESSION['access_secret']) &&
Expand All @@ -480,17 +480,23 @@ private function authenticate_user() : void {
$user = (string) $ident->username;
if (!self::is_valid_user($user)) {
unset($_SESSION['access_key'], $_SESSION['access_secret']);
report_error('User is either invalid or blocked according to ' . API_ROOT . '?action=query&usprop=blockinfo&format=json&list=users&ususers=' . urlencode(str_replace(" ", "_", $user)));
// report_error('User is either invalid or blocked according to ' . API_ROOT . '?action=query&usprop=blockinfo&format=json&list=users&ususers=' . urlencode(str_replace(" ", "_", $user)));
report_error('User ' . echoable(str_replace(" ", "_", $user)) . ' is either invalid or blocked');
}
$this->the_user = $user;
$_SESSION['citation_bot_user_id'] = $this->the_user;
session_write_close(); // Done with the session
flush(); // stability
return;
}
catch (Throwable $e) { ; }
}
if (empty($_SERVER['REQUEST_URI'])) {
$name = (string) @session_name();
$id = (string) @session_id();
session_destroy(); // This is really bad news
flush(); // Paranoid
@setcookie($name, $id, time()-42000, "", "", TRUE, TRUE);
report_error('Invalid access attempt to internal API');
} else {
unset($_SESSION['access_key'], $_SESSION['access_secret']);
Expand Down
5 changes: 5 additions & 0 deletions Zotero.php
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,11 @@ public static function expand_by_zotero(Template $template, ?string $url = NULL)
if (!$template->profoundly_incomplete($url)) return FALSE; // Only risk unvetted data if there's little good data to sully

if(stripos($url, 'CITATION_BOT_PLACEHOLDER') !== FALSE) return FALSE; // That's a bad url

// Clean up URLs
if(preg_match('~^(https?://(?:www\.|)nature\.com/articles/[a-zA-Z0-9\.]+)\.pdf(?:|\?.*)$~', $url, $matches)) {
$url = $matches[1]; // remove PDF from Nature urls
}

$bad_url = implode('|', ZOTERO_AVOID_REGEX);
if(preg_match("~^https?://(?:www\.|m\.|)(?:" . $bad_url . ")~i", $url)) return FALSE;
Expand Down
Loading

0 comments on commit de6809b

Please sign in to comment.