From 921f08660dbeab8cdd508d3c8167342429ae8ea0 Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Mon, 25 Apr 2022 14:57:17 -0400
Subject: [PATCH 01/47] add tests

---
 tests/outfile.txt            |   8 ++
 tests/predictions.csv        |  35 +++++
 tests/testBib_erroneous.bib  | 268 +++++++++++++++++++++++++++++++++++
 tests/testBib_immaculate.bib | 257 +++++++++++++++++++++++++++++++++
 4 files changed, 568 insertions(+)
 create mode 100644 tests/outfile.txt
 create mode 100644 tests/predictions.csv
 create mode 100644 tests/testBib_erroneous.bib
 create mode 100644 tests/testBib_immaculate.bib

diff --git a/tests/outfile.txt b/tests/outfile.txt
new file mode 100644
index 0000000..735da3e
--- /dev/null
+++ b/tests/outfile.txt
@@ -0,0 +1,8 @@
+2.61 0.52 0.07 0.73 2.07 0.05 0.10 0.09
+0.55 0.08 0.01 0.04 0.36 0.36 0.10 0.06
+0.09 0.10 0.00 0.06 0.19 0.01 0.89 0.01
+0.24 0.04 0.01 0.63 0.09 0.02 0.01 0.36
+0.97 0.03 0.01 0.11 2.16 0.02 0.05 0.13
+0.81 0.00 0.01 0.06 0.25 0.00 0.00 0.00
+0.08 0.00 0.00 0.01 0.09 0.00 0.00 0.01
+0.04 0.00 0.00 0.00 0.49 0.00 0.01 0.01
diff --git a/tests/predictions.csv b/tests/predictions.csv
new file mode 100644
index 0000000..aeff3e8
--- /dev/null
+++ b/tests/predictions.csv
@@ -0,0 +1,35 @@
+,CitationKey,Author,Gender,W,A,GendCat
+0,buzsaki2013memory,"György,Buzsáki","male,99",0.9489384293556213,0.051061596255749464,
+1,buzsaki2013memory,"Edvard,Moser","male,99",0.8077561855316162,0.1922438140027225,malemale
+2,Lundine2019,"J,Lundine","unknown,0",0.8451798558235168,0.15482008177787066,
+3,Lundine2019,"Dina,Balabanova","female,96",0.8948734998703003,0.10512647964060307,unknownfemale
+4,wang2021gendered,"Xinyi,Wang","female,88",0.0034336799290031195,0.9965664306655526,
+5,wang2021gendered,"David,Lydon-Staley","male,99",0.9184235334396362,0.0815764528233558,femalemale
+6,moralia2005,"William,Plutarch","male,99",0.9850617051124573,0.014938272070139647,
+7,moralia2005,"William,Plutarch","male,99",0.9850617051124573,0.014938272070139647,malemale
+8,jurafsky2018n,"D,Jurafsky","unknown,0",0.9343108534812927,0.0656891418620944,
+9,jurafsky2018n,"JH,Martin","male,82",0.49100935459136963,0.5089906957000494,unknownmale
+10,maliniak2013gender,"Daniel,Maliniak","male,99",0.9659098386764526,0.03409015154466033,
+11,maliniak2013gender,"Barbara,Walter","female,98",0.9297952651977539,0.07020476355683059,malefemale
+12,caplar2017quantitative,"Neven,Caplar","male,75",0.8359426856040955,0.1640572901815176,
+13,caplar2017quantitative,"Simon,Birrer","male,98",0.599306046962738,0.4006939800456166,malemale
+14,mitchell2013gendered,"Sara,Mitchell","female,98",0.8715987205505371,0.12840125896036625,
+15,mitchell2013gendered,"Holly,Brus","female,96",0.942419707775116,0.05758026405237615,femalefemale
+16,dion2018gendered,"Michelle,Dion","female,97",0.8750959038734436,0.12490414828062057,
+17,dion2018gendered,"Sara,Mitchell","female,98",0.8715987205505371,0.12840125896036625,femalefemale
+18,ambekar2009name,"Anurag,Ambekar","male,100",0.2662626802921295,0.7337373411282897,
+19,ambekar2009name,"Steven,Skiena","male,99",0.9796334505081177,0.020366581855341792,malemale
+20,sood2018predicting,"Gaurav,Sood","male,100",0.6892917156219482,0.31070827692747116,
+21,sood2018predicting,"Suriyan,Laohaprapanon","male,96",0.06737928092479706,0.9326208103448153,malemale
+22,chatterjee2021gender,"Paula,Chatterjee","female,98",0.41062963008880615,0.5893703922629356,
+23,chatterjee2021gender,"Rachel,Werner","female,98",0.9786281585693359,0.02137181058060378,femalefemale
+24,fulvio2021imbalance,"Jacqueline,Fulvio","female,98",0.8698657751083374,0.13013425190001726,
+25,fulvio2021imbalance,"Bradley,Postle","male,99",0.9850615859031677,0.014938393025659025,femalemale
+26,ethnicolr2022black,"Denzel,Washington","male,100",0.004687963519245386,0.9953120946884155,
+27,ethnicolr2022black,"Ketanji,Brown-Jackson","unknown,0",0.004336825106292963,0.9956631234381348,maleunknown
+28,ethnicolr2022hispanic,"Rafael,Cruz","male,99",0.0451166108250618,0.9548833764856681,
+29,ethnicolr2022hispanic,"Alexandria,Ocasio-Cortez","female,97",0.02650504559278488,0.9734949340345338,malefemale
+30,ethnicolr2022asian,"Andrew,Wang","male,99",0.07631298899650574,0.923687070608139,
+31,ethnicolr2022asian,"Michelle,Yeoh","female,97",0.3952791690826416,0.6047207862138748,malefemale
+32,ethnicolr2022white,"Nicolas,Coppola","male,99",0.8281028866767883,0.17189706675708294,
+33,ethnicolr2022white,"Meryl,Streep","female,91",0.9587977528572083,0.04120224388316274,malefemale
diff --git a/tests/testBib_erroneous.bib b/tests/testBib_erroneous.bib
new file mode 100644
index 0000000..2bcc596
--- /dev/null
+++ b/tests/testBib_erroneous.bib
@@ -0,0 +1,268 @@
+@article{buzsaki2013memory,
+  title={Memory, navigation and theta rhythm in the hippocampal-entorhinal system},
+  author={Buzs{\'a}ki, Gy{\"o}rgy and Moser, Edvard I},
+  journal={Nature neuroscience},
+  volume={16},
+  number={2},
+  pages={130},
+  year={2013},
+  publisher={Nature Publishing Group}
+}
+
+@article{Lundine2019,
+
+                abstract = {Academic experts share their ideas, as well as contribute to advancing health science by participating in publishing as an author, reviewer and editor. The academy shapes and is shaped by knowledge produced within it. As such, the production of scientific knowledge can be described as part of a socially constructed system. Like all socially constructed systems, scientific knowledge production is influenced by gender. This study investigated one layer of this system through an analysis of journal editors' understanding of if and how gender influences editorial practices in peer reviewed health science journals. The study involved two stages: 1) exploratory in-depth qualitative interviews with editors at health science journals; and 2) a nominal group technique (NGT) with experts working on gender in research, academia and the journal peer review process. Our findings indicate that some editors had not considered the impact of gender on their editorial work. Many described how they actively strive to be ‘gender blind,' as this was seen as a means to be objective. This view fails to recognize how broader social structures operate to produce systemic inequities. None of the editors or publishers in this study were collecting gender or other social indicators as part of the article submission process. These findings suggest that there is room for editors and publishers to play a more active role in addressing structural inequities in academic publishing to ensure a diversity of knowledge and ideas are reflected.},
+
+                author = {Lundine, J. and Bourgeault, Ivy Lynn and Glonti, Ketevan and Hutchinson, Eleanor and Balabanova, Dina},
+
+                doi = {10.1016/j.socscimed.2019.112388},
+
+                file = {:Users/stiso/Downloads/1-s2.0-S0277953619303739-main.pdf:pdf},
+
+                issn = {18735347},
+
+                journal = {Social Science and Medicine},
+
+                keywords = {Feminist science studies,Gender inequity,Health sciences,Journalology,Peer review,Publishing},
+
+                number = {January},
+
+                pages = {112388},
+
+                pmid = {31288167},
+
+                publisher = {Elsevier},
+
+                title = {{“I don't see gender”: Conceptualizing a gendered system of academic publishing}},
+
+                url = {https://doi.org/10.1016/j.socscimed.2019.112388},
+
+                volume = {235},
+
+                year = {2019}
+
+}
+
+@article{wang2021gendered,
+
+                author = {Wang, Xinyi and Dworkin, Jordan D. and Zhou, Dale and Stiso, Jennifer and Falk, Emily B and Bassett, Danielle S. and Zurn, Perry and Lydon-Staley, David M.},
+
+                year = {2021},
+
+                title = {Gendered citation practices in the field of communication},
+
+                journal = {Annals of the International Communication Association},
+
+                doi = {10.1080/23808985.2021.1960180},
+}
+
+@article{zurn2020network,
+  title={Network architectures supporting learnability},
+  author={Zurn, Perry and Bassett, Danielle S},
+  journal={Philosophical Transactions of the Royal Society B},
+  volume={375},
+  number={1796},
+  pages={20190323},
+  year={2020},
+  publisher={The Royal Society}
+}
+
+@article{zurn2020network,
+  title={Network architectures supporting learnability},
+  author={Zurn, Perry and Bassett, Danielle S},
+  journal={Philosophical Transactions of the Royal Society B},
+  volume={375},
+  number={1796},
+  pages={20190323},
+  year={2020},
+  publisher={The Royal Society}
+}
+
+@book{moralia2005,
+  title={Moralia, Volume VI},
+  author={Plutarch, Helmbold, William},
+  year={1939},
+  publisher={Harvard University Press}
+}
+
+@book{bassett2022curious, 
+title={Curious Minds}, 
+author={Danielle S. Bassett and Perry Zurn},
+publisher={MIT Press}, 
+year={2022},
+}
+
+@book{fake2022, 
+title={fake}, 
+author={Danielle S. Bassett and Dale Zhou and Jennifer Stiso},
+publisher={MIT Press}, 
+year={2022},
+}
+
+@article{jurafsky2018n,
+  title={N-gram language models},
+  author={Jurafsky, D and Martin, JH},
+  journal={Speech and Language Processing: An Introduction to Natural Language Processing, Computational Linguistics, and Speech Recognition},
+  year={2018}
+}
+
+@article {Dworkin2020.01.03.894378,
+  author = {Dworkin, Jordan D. and Linn, Kristin A. and Teich, Erin G. and Zurn, Perry and Shinohara, Russell T. and Bassett, Danielle S.},
+  title = {The extent and drivers of gender imbalance in neuroscience reference lists},
+  elocation-id = {2020.01.03.894378},
+  year = {2020},
+  doi = {10.1101/2020.01.03.894378},
+  publisher = {Cold Spring Harbor Laboratory},
+  abstract = {Like many scientific disciplines, neuroscience has increasingly attempted to confront pervasive gender imbalances within the field. While much of the conversation has centered around publishing and conference participation, recent research in other fields has called attention to the prevalence of gender bias in citation practices. Because of the downstream effects that citations can have on visibility and career advancement, understanding and eliminating gender bias in citation practices is vital for addressing inequity in a scientific community. In this study, we sought to determine whether there is evidence of gender bias in the citation practices of neuroscientists. Utilizing data from five top neuroscience journals, we indeed find that reference lists tend to include more papers with men as first and last author than would be expected if gender was not a factor in referencing. Importantly, we show that this overcitation of men and undercitation of women is driven largely by the citation practices of men, and is increasing with time despite greater diversity in the academy. We develop a co-authorship network to determine the degree to which homophily in researchers{\textquoteright} social networks explains gendered citation practices and we find that men tend to overcite other men even when their social networks are representative of the field. We discuss possible mechanisms and consider how individual researchers might incorporate these findings into their own referencing practices.},
+  URL = {https://www.biorxiv.org/content/early/2020/01/11/2020.01.03.894378},
+  eprint = {https://www.biorxiv.org/content/early/2020/01/11/2020.01.03.894378.full.pdf},
+  journal = {bioRxiv}
+}
+
+@article{maliniak2013gender,
+  title={The gender citation gap in international relations},
+  author={Maliniak, Daniel and Powers, Ryan and Walter, Barbara F},
+  journal={International Organization},
+  volume={67},
+  number={4},
+  pages={889--922},
+  year={2013},
+  publisher={Cambridge University Press}
+}
+
+@article{caplar2017quantitative,
+  title={Quantitative evaluation of gender bias in astronomical publications from citation counts},
+  author={Caplar, Neven and Tacchella, Sandro and Birrer, Simon},
+  journal={Nature Astronomy},
+  volume={1},
+  number={6},
+  pages={0141},
+  year={2017},
+  publisher={Nature Publishing Group}
+}
+
+@article{mitchell2013gendered,
+  title={Gendered citation patterns in international relations journals},
+  author={Mitchell, Sara McLaughlin and Lange, Samantha and Brus, Holly},
+  journal={International Studies Perspectives},
+  volume={14},
+  number={4},
+  pages={485--492},
+  year={2013},
+  publisher={Blackwell Publishing Ltd Oxford, UK}
+}
+
+@article{dion2018gendered,
+  title={Gendered citation patterns across political science and social science methodology fields},
+  author={Dion, Michelle L and Sumner, Jane Lawrence and Mitchell, Sara McLaughlin},
+  journal={Political Analysis},
+  volume={26},
+  number={3},
+  pages={312--327},
+  year={2018},
+  publisher={Cambridge University Press}
+}
+
+@software{zhou_dale_2020_3672110,
+  author       = {Zhou, Dale and
+                  Cornblath, Eli J. and
+                  Stiso, Jennifer and
+                  Teich, Erin G. and
+                  Dworkin, Jordan D. and
+                  Blevins, Ann S. and
+                  Bassett, Danielle S.},
+  title        = {Gender Diversity Statement and Code Notebook v1.0},
+  month        = feb,
+  year         = 2020,
+  publisher    = {Zenodo},
+  version      = {v1.0},
+  doi          = {10.5281/zenodo.3672110},
+  url          = {https://doi.org/10.5281/zenodo.3672110}
+}
+
+@inproceedings{ambekar2009name,
+  title={Name-ethnicity classification from open sources},
+  author={Ambekar, Anurag and Ward, Charles and Mohammed, Jahangir and Male, Swapna and Skiena, Steven},
+  booktitle={Proceedings of the 15th ACM SIGKDD international conference on Knowledge Discovery and Data Mining},
+  pages={49--58},
+  year={2009}
+}
+
+@article{sood2018predicting,
+  title={Predicting race and ethnicity from the sequence of characters in a name},
+  author={Sood, Gaurav and Laohaprapanon, Suriyan},
+  journal={arXiv preprint arXiv:1805.02109},
+  year={2018}
+}
+
+@article{bertolero2021racial,
+title = {Racial and ethnic imbalance in neuroscience reference lists and intersections with gender},
+author = {Bertolero, Maxwell A. and Dworkin, Jordan D. and David, Sophia U. and Lloreda, Claudia López and Srivastava, Pragya and Stiso, Jennifer and Zhou, Dale and Dzirasa, Kafui and Fair, Damien A. and  Kaczkurkin, Antonia N. and Marlin, Bianca Jones and Shohamy, Daphna and Uddin, Lucina Q. and Zurn, Perry and Bassett, Danielle S.},
+journal = {bioRxiv},
+year = {2020},
+xoi = {10.1101/2020.10.12.336230},
+}
+
+@article{chatterjee2021gender,
+journal = {JAMA Netw Open},
+year = {2021},
+volume = {4},
+number = {7},
+pages = {e2114509},
+title = {Gender Disparity in Citations in High-Impact Journal Articles},
+author = {Chatterjee, Paula and Werner, Rachel M},
+}
+
+@article{fulvio2021imbalance,
+title = {Gender (Im)balance in Citation Practices in Cognitive Neuroscience},
+author = {Fulvio, Jacqueline M and Akinnola, Ileri and Postle, Bradley R},
+journal = {J Cogn Neurosci},
+year = {2021},
+volume = {33},
+number = {1},
+pages = {3-7},
+}
+
+@article{ethnicolr2022black,
+  title={Test of ethnicolr},
+  author={Washington, Denzel and Brown-Jackson, Ketanji},
+  journal={Nature neuroscience},
+  volume={16},
+  number={2},
+  pages={130},
+  year={2013},
+  publisher={Nature Publishing Group}
+}
+
+@article{ethnicolr2022hispanic,
+  title={Test of ethnicolr},
+  author={Cruz, Rafael and Ocasio-Cortez, Alexandria},
+  journal={Nature neuroscience},
+  volume={16},
+  number={2},
+  pages={130},
+  year={2013},
+  publisher={Nature Publishing Group}
+}
+
+@article{ethnicolr2022asian,
+  title={Test of ethnicolr},
+  author={Wang, Andrew and Yeoh, Michelle},
+  journal={Nature neuroscience},
+  volume={16},
+  number={2},
+  pages={130},
+  year={2013},
+  publisher={Nature Publishing Group}
+}
+
+@article{ethnicolr2022white,
+  title={Test of ethnicolr},
+  author={Coppola, Nicolas and Streep, Meryl},
+  journal={Nature neuroscience},
+  volume={16},
+  number={2},
+  pages={130},
+  year={2013},
+  publisher={Nature Publishing Group}
+}
\ No newline at end of file
diff --git a/tests/testBib_immaculate.bib b/tests/testBib_immaculate.bib
new file mode 100644
index 0000000..039fa4c
--- /dev/null
+++ b/tests/testBib_immaculate.bib
@@ -0,0 +1,257 @@
+@article{buzsaki2013memory,
+  title={Memory, navigation and theta rhythm in the hippocampal-entorhinal system},
+  author={Buzs{\'a}ki, Gy{\"o}rgy and Moser, Edvard I},
+  journal={Nature neuroscience},
+  volume={16},
+  number={2},
+  pages={130},
+  year={2013},
+  publisher={Nature Publishing Group}
+}
+
+@article{Lundine2019,
+
+                abstract = {Academic experts share their ideas, as well as contribute to advancing health science by participating in publishing as an author, reviewer and editor. The academy shapes and is shaped by knowledge produced within it. As such, the production of scientific knowledge can be described as part of a socially constructed system. Like all socially constructed systems, scientific knowledge production is influenced by gender. This study investigated one layer of this system through an analysis of journal editors' understanding of if and how gender influences editorial practices in peer reviewed health science journals. The study involved two stages: 1) exploratory in-depth qualitative interviews with editors at health science journals; and 2) a nominal group technique (NGT) with experts working on gender in research, academia and the journal peer review process. Our findings indicate that some editors had not considered the impact of gender on their editorial work. Many described how they actively strive to be ‘gender blind,' as this was seen as a means to be objective. This view fails to recognize how broader social structures operate to produce systemic inequities. None of the editors or publishers in this study were collecting gender or other social indicators as part of the article submission process. These findings suggest that there is room for editors and publishers to play a more active role in addressing structural inequities in academic publishing to ensure a diversity of knowledge and ideas are reflected.},
+
+                author = {Lundine, J. and Bourgeault, Ivy Lynn and Glonti, Ketevan and Hutchinson, Eleanor and Balabanova, Dina},
+
+                doi = {10.1016/j.socscimed.2019.112388},
+
+                file = {:Users/stiso/Downloads/1-s2.0-S0277953619303739-main.pdf:pdf},
+
+                issn = {18735347},
+
+                journal = {Social Science and Medicine},
+
+                keywords = {Feminist science studies,Gender inequity,Health sciences,Journalology,Peer review,Publishing},
+
+                number = {January},
+
+                pages = {112388},
+
+                pmid = {31288167},
+
+                publisher = {Elsevier},
+
+                title = {{“I don't see gender”: Conceptualizing a gendered system of academic publishing}},
+
+                url = {https://doi.org/10.1016/j.socscimed.2019.112388},
+
+                volume = {235},
+
+                year = {2019}
+
+}
+
+@article{wang2021gendered,
+
+                author = {Wang, Xinyi and Dworkin, Jordan D. and Zhou, Dale and Stiso, Jennifer and Falk, Emily B and Bassett, Danielle S. and Zurn, Perry and Lydon-Staley, David M.},
+
+                year = {2021},
+
+                title = {Gendered citation practices in the field of communication},
+
+                journal = {Annals of the International Communication Association},
+
+                doi = {10.1080/23808985.2021.1960180},
+}
+
+@article{zurn2020network,
+  title={Network architectures supporting learnability},
+  author={Zurn, Perry and Bassett, Danielle S},
+  journal={Philosophical Transactions of the Royal Society B},
+  volume={375},
+  number={1796},
+  pages={20190323},
+  year={2020},
+  publisher={The Royal Society}
+}
+
+@book{moralia2005,
+  title={Moralia, Volume VI},
+  author={Plutarch, Helmbold, William},
+  year={1939},
+  publisher={Harvard University Press}
+}
+
+@book{bassett2022curious, 
+title={Curious Minds}, 
+author={Danielle S. Bassett and Perry Zurn},
+publisher={MIT Press}, 
+year={2022},
+}
+
+@book{fake2022, 
+title={fake}, 
+author={Danielle S. Bassett and Dale Zhou and Jennifer Stiso},
+publisher={MIT Press}, 
+year={2022},
+}
+
+@article{jurafsky2018n,
+  title={N-gram language models},
+  author={Jurafsky, D and Martin, JH},
+  journal={Speech and Language Processing: An Introduction to Natural Language Processing, Computational Linguistics, and Speech Recognition},
+  year={2018}
+}
+
+@article {Dworkin2020.01.03.894378,
+  author = {Dworkin, Jordan D. and Linn, Kristin A. and Teich, Erin G. and Zurn, Perry and Shinohara, Russell T. and Bassett, Danielle S.},
+  title = {The extent and drivers of gender imbalance in neuroscience reference lists},
+  elocation-id = {2020.01.03.894378},
+  year = {2020},
+  doi = {10.1101/2020.01.03.894378},
+  publisher = {Cold Spring Harbor Laboratory},
+  abstract = {Like many scientific disciplines, neuroscience has increasingly attempted to confront pervasive gender imbalances within the field. While much of the conversation has centered around publishing and conference participation, recent research in other fields has called attention to the prevalence of gender bias in citation practices. Because of the downstream effects that citations can have on visibility and career advancement, understanding and eliminating gender bias in citation practices is vital for addressing inequity in a scientific community. In this study, we sought to determine whether there is evidence of gender bias in the citation practices of neuroscientists. Utilizing data from five top neuroscience journals, we indeed find that reference lists tend to include more papers with men as first and last author than would be expected if gender was not a factor in referencing. Importantly, we show that this overcitation of men and undercitation of women is driven largely by the citation practices of men, and is increasing with time despite greater diversity in the academy. We develop a co-authorship network to determine the degree to which homophily in researchers{\textquoteright} social networks explains gendered citation practices and we find that men tend to overcite other men even when their social networks are representative of the field. We discuss possible mechanisms and consider how individual researchers might incorporate these findings into their own referencing practices.},
+  URL = {https://www.biorxiv.org/content/early/2020/01/11/2020.01.03.894378},
+  eprint = {https://www.biorxiv.org/content/early/2020/01/11/2020.01.03.894378.full.pdf},
+  journal = {bioRxiv}
+}
+
+@article{maliniak2013gender,
+  title={The gender citation gap in international relations},
+  author={Maliniak, Daniel and Powers, Ryan and Walter, Barbara F},
+  journal={International Organization},
+  volume={67},
+  number={4},
+  pages={889--922},
+  year={2013},
+  publisher={Cambridge University Press}
+}
+
+@article{caplar2017quantitative,
+  title={Quantitative evaluation of gender bias in astronomical publications from citation counts},
+  author={Caplar, Neven and Tacchella, Sandro and Birrer, Simon},
+  journal={Nature Astronomy},
+  volume={1},
+  number={6},
+  pages={0141},
+  year={2017},
+  publisher={Nature Publishing Group}
+}
+
+@article{mitchell2013gendered,
+  title={Gendered citation patterns in international relations journals},
+  author={Mitchell, Sara McLaughlin and Lange, Samantha and Brus, Holly},
+  journal={International Studies Perspectives},
+  volume={14},
+  number={4},
+  pages={485--492},
+  year={2013},
+  publisher={Blackwell Publishing Ltd Oxford, UK}
+}
+
+@article{dion2018gendered,
+  title={Gendered citation patterns across political science and social science methodology fields},
+  author={Dion, Michelle L and Sumner, Jane Lawrence and Mitchell, Sara McLaughlin},
+  journal={Political Analysis},
+  volume={26},
+  number={3},
+  pages={312--327},
+  year={2018},
+  publisher={Cambridge University Press}
+}
+
+@software{zhou_dale_2020_3672110,
+  author       = {Zhou, Dale and
+                  Cornblath, Eli J. and
+                  Stiso, Jennifer and
+                  Teich, Erin G. and
+                  Dworkin, Jordan D. and
+                  Blevins, Ann S. and
+                  Bassett, Danielle S.},
+  title        = {Gender Diversity Statement and Code Notebook v1.0},
+  month        = feb,
+  year         = 2020,
+  publisher    = {Zenodo},
+  version      = {v1.0},
+  doi          = {10.5281/zenodo.3672110},
+  url          = {https://doi.org/10.5281/zenodo.3672110}
+}
+
+@inproceedings{ambekar2009name,
+  title={Name-ethnicity classification from open sources},
+  author={Ambekar, Anurag and Ward, Charles and Mohammed, Jahangir and Male, Swapna and Skiena, Steven},
+  booktitle={Proceedings of the 15th ACM SIGKDD international conference on Knowledge Discovery and Data Mining},
+  pages={49--58},
+  year={2009}
+}
+
+@article{sood2018predicting,
+  title={Predicting race and ethnicity from the sequence of characters in a name},
+  author={Sood, Gaurav and Laohaprapanon, Suriyan},
+  journal={arXiv preprint arXiv:1805.02109},
+  year={2018}
+}
+
+@article{bertolero2021racial,
+title = {Racial and ethnic imbalance in neuroscience reference lists and intersections with gender},
+author = {Bertolero, Maxwell A. and Dworkin, Jordan D. and David, Sophia U. and Lloreda, Claudia López and Srivastava, Pragya and Stiso, Jennifer and Zhou, Dale and Dzirasa, Kafui and Fair, Damien A. and  Kaczkurkin, Antonia N. and Marlin, Bianca Jones and Shohamy, Daphna and Uddin, Lucina Q. and Zurn, Perry and Bassett, Danielle S.},
+journal = {bioRxiv},
+year = {2020},
+xoi = {10.1101/2020.10.12.336230},
+}
+
+@article{chatterjee2021gender,
+journal = {JAMA Netw Open},
+year = {2021},
+volume = {4},
+number = {7},
+pages = {e2114509},
+title = {Gender Disparity in Citations in High-Impact Journal Articles},
+author = {Chatterjee, Paula and Werner, Rachel M},
+}
+
+@article{fulvio2021imbalance,
+title = {Gender (Im)balance in Citation Practices in Cognitive Neuroscience},
+author = {Fulvio, Jacqueline M and Akinnola, Ileri and Postle, Bradley R},
+journal = {J Cogn Neurosci},
+year = {2021},
+volume = {33},
+number = {1},
+pages = {3-7},
+}
+
+@article{ethnicolr2022black,
+  title={Test of ethnicolr},
+  author={Washington, Denzel and Brown-Jackson, Ketanji},
+  journal={Nature neuroscience},
+  volume={16},
+  number={2},
+  pages={130},
+  year={2013},
+  publisher={Nature Publishing Group}
+}
+
+@article{ethnicolr2022hispanic,
+  title={Test of ethnicolr},
+  author={Cruz, Rafael and Ocasio-Cortez, Alexandria},
+  journal={Nature neuroscience},
+  volume={16},
+  number={2},
+  pages={130},
+  year={2013},
+  publisher={Nature Publishing Group}
+}
+
+@article{ethnicolr2022asian,
+  title={Test of ethnicolr},
+  author={Wang, Andrew and Yeoh, Michelle},
+  journal={Nature neuroscience},
+  volume={16},
+  number={2},
+  pages={130},
+  year={2013},
+  publisher={Nature Publishing Group}
+}
+
+@article{ethnicolr2022white,
+  title={Test of ethnicolr},
+  author={Coppola, Nicolas and Streep, Meryl},
+  journal={Nature neuroscience},
+  volume={16},
+  number={2},
+  pages={130},
+  year={2013},
+  publisher={Nature Publishing Group}
+}
\ No newline at end of file

From 5ac8bef9e91eeaf7cc3b103e17b87a561d5288d4 Mon Sep 17 00:00:00 2001
From: Jennifer Stiso <jeni.stiso@gmail.com>
Date: Mon, 25 Apr 2022 15:10:25 -0400
Subject: [PATCH 02/47] added existing functions to scripts

---
 utils/__init__.py      |  0
 utils/preprocessing.py | 55 ++++++++++++++++++++++++++++++++++++++++++
 utils/queries.py       | 51 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 106 insertions(+)
 create mode 100644 utils/__init__.py
 create mode 100644 utils/preprocessing.py
 create mode 100644 utils/queries.py

diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/utils/preprocessing.py b/utils/preprocessing.py
new file mode 100644
index 0000000..10933ac
--- /dev/null
+++ b/utils/preprocessing.py
@@ -0,0 +1,55 @@
+def checkcites_output(aux_file):
+    '''take in aux file for tex document, return list of citation keys
+    that are in .bib file but not in document'''
+
+    result = subprocess.run(['texlua', 'checkcites.lua', aux_file[0]], stdout=subprocess.PIPE)
+    result = result.stdout.decode('utf-8')
+    unused_array_raw = result.split('\n')
+    # process array of unused references + other output
+    unused_array_final = list()
+    for x in unused_array_raw:
+        if len(x) > 0: # if line is not empty
+            if x[0] == '-':  # and if first character is a '-', it's a citation key
+                unused_array_final.append(x[2:]) # truncate '- '
+    if "------------------------------------------------------------------------" in unused_array_final:
+        return(result)
+    else:
+        return(unused_array_final)
+
+
+def removeMiddleName(line):
+    arr = line.split()
+    last = arr.pop()
+    n = len(arr)
+    if n == 4:
+        first, middle = ' '.join(arr[:2]), ' '.join(arr[2:])
+    elif n == 3:
+        first, middle = arr[0], ' '.join(arr[1:])
+    elif n == 2:
+        first, middle = arr
+    elif n==1:
+        return line
+    return(str(first + ' ' + middle))
+
+
+def returnFirstName(line):
+    arr = line.split()
+    n = len(arr)
+    if n == 4:
+        first, middle = ' '.join(arr[:2]), ' '.join(arr[2:])
+    elif n == 3:
+        first, middle = arr[0], ' '.join(arr[1:])
+    elif n == 2:
+        first, middle = arr
+    elif n==1:
+        return line
+    return(str(middle))
+
+
+def convertLatexSpecialChars(latex_text):
+    return LatexNodes2Text().latex_to_text(latex_text)
+
+
+def convertSpecialCharsToUTF8(text):
+    data = LatexNodes2Text().latex_to_text(text)
+    return unicodedata.normalize('NFD', data).encode('ascii', 'ignore').decode('utf-8')
diff --git a/utils/queries.py b/utils/queries.py
new file mode 100644
index 0000000..7d0987e
--- /dev/null
+++ b/utils/queries.py
@@ -0,0 +1,51 @@
+def namesFromXref(doi, title, authorPos):
+    '''Use DOI and article titles to query Crossref for author list'''
+    if authorPos == 'first':
+        idx = 0
+    elif authorPos == 'last':
+        idx = -1
+    # get cross ref data
+    authors = ['']
+    # first try DOI
+    if doi != "":
+        works = cr.works(query=title, select=["DOI", "author"], limit=1, filter={'doi': doi})
+        if works['message']['total-results'] > 0:
+            authors = works['message']['items'][0]['author']
+    elif title != '':
+        works = cr.works(query=f'title:"{title}"', select=["title", "author"], limit=10)
+        cnt = 0
+        name = ''
+        # check that you grabbed the proper paper
+        if works['message']['items'][cnt]['title'][0].lower() == title.lower():
+            authors = works['message']['items'][0]['author']
+
+    # check the all fields are available
+    if not 'given' in authors[idx]:
+        name = ''
+    else:
+        # trim initials
+        name = authors[idx]['given'].replace('.', ' ').split()[0]
+
+    return name
+
+
+def namesFromXrefSelfCite(doi, title):
+    selfCiteCheck = 0
+    # get cross ref data
+    authors = ['']
+    # first try DOI
+    if doi != "":
+        works = cr.works(query=title, select=["DOI", "author"], limit=1, filter={'doi': doi})
+        if works['message']['total-results'] > 0:
+            authors = works['message']['items'][0]['author']
+
+    for i in authors:
+        if i != "":
+            first = i['given'].replace('.', ' ').split()[0]
+            last = i['family'].replace('.', ' ').split()[0]
+            authors = removeMiddleName(last + ", " + first)
+            if authors in removeMiddleName(yourFirstAuthor) or authors in removeMiddleName(
+                    convertSpecialCharsToUTF8(yourFirstAuthor)) or authors in removeMiddleName(
+                    yourLastAuthor) or authors in removeMiddleName(convertSpecialCharsToUTF8(yourLastAuthor)):
+                selfCiteCheck += 1
+    return selfCiteCheck

From 9470d693511a2ead2b48d0dbf81026240a52ed17 Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Mon, 25 Apr 2022 15:13:52 -0400
Subject: [PATCH 03/47] get rid of r files

---
 environment.yaml | 2 --
 install.R        | 2 --
 postBuild        | 1 -
 requirements.txt | 2 --
 4 files changed, 7 deletions(-)
 delete mode 100644 install.R
 delete mode 100644 postBuild

diff --git a/environment.yaml b/environment.yaml
index f6d8b41..e1d9167 100644
--- a/environment.yaml
+++ b/environment.yaml
@@ -16,8 +16,6 @@ dependencies:
     - pandas
     - re
     - pylatexenc
-    - sos
-    - sos-notebook
     - habanero
     - tqdm
     - json
diff --git a/install.R b/install.R
deleted file mode 100644
index 87f4b40..0000000
--- a/install.R
+++ /dev/null
@@ -1,2 +0,0 @@
-install.packages('rjson')
-install.packages('ggplot2')
\ No newline at end of file
diff --git a/postBuild b/postBuild
deleted file mode 100644
index 279de15..0000000
--- a/postBuild
+++ /dev/null
@@ -1 +0,0 @@
-python3 -m sos_notebook.install
diff --git a/requirements.txt b/requirements.txt
index 1bc7eef..a4ffc94 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,8 +3,6 @@ numpy==1.19.2
 bibtexparser==1.1.0
 pandas==1.1.3
 pylatexenc==2.1
-sos==0.21.5
-sos-notebook==0.21.7
 habanero==0.7.2
 ethnicolr==0.4.0
 matplotlib==3.3.2

From 2513e4eb7ee3fc69f28339d85d6210c469f0505f Mon Sep 17 00:00:00 2001
From: Jennifer Stiso <jeni.stiso@gmail.com>
Date: Mon, 25 Apr 2022 15:16:53 -0400
Subject: [PATCH 04/47] added imports to function scripts

---
 utils/preprocessing.py | 33 +++++++++++++++++++++++++++++----
 utils/queries.py       | 21 +--------------------
 2 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/utils/preprocessing.py b/utils/preprocessing.py
index 10933ac..09f5224 100644
--- a/utils/preprocessing.py
+++ b/utils/preprocessing.py
@@ -1,3 +1,7 @@
+import subprocess
+from pylatexenc.latex2text import LatexNodes2Text
+import unicodedata
+
 def checkcites_output(aux_file):
     '''take in aux file for tex document, return list of citation keys
     that are in .bib file but not in document'''
@@ -12,9 +16,9 @@ def checkcites_output(aux_file):
             if x[0] == '-':  # and if first character is a '-', it's a citation key
                 unused_array_final.append(x[2:]) # truncate '- '
     if "------------------------------------------------------------------------" in unused_array_final:
-        return(result)
+        return result
     else:
-        return(unused_array_final)
+        return unused_array_final
 
 
 def removeMiddleName(line):
@@ -29,7 +33,7 @@ def removeMiddleName(line):
         first, middle = arr
     elif n==1:
         return line
-    return(str(first + ' ' + middle))
+    return str(first + ' ' + middle)
 
 
 def returnFirstName(line):
@@ -43,7 +47,7 @@ def returnFirstName(line):
         first, middle = arr
     elif n==1:
         return line
-    return(str(middle))
+    return str(middle)
 
 
 def convertLatexSpecialChars(latex_text):
@@ -53,3 +57,24 @@ def convertLatexSpecialChars(latex_text):
 def convertSpecialCharsToUTF8(text):
     data = LatexNodes2Text().latex_to_text(text)
     return unicodedata.normalize('NFD', data).encode('ascii', 'ignore').decode('utf-8')
+
+def namesFromXrefSelfCite(doi, title):
+    selfCiteCheck = 0
+    # get cross ref data
+    authors = ['']
+    # first try DOI
+    if doi != "":
+        works = cr.works(query=title, select=["DOI", "author"], limit=1, filter={'doi': doi})
+        if works['message']['total-results'] > 0:
+            authors = works['message']['items'][0]['author']
+
+    for i in authors:
+        if i != "":
+            first = i['given'].replace('.', ' ').split()[0]
+            last = i['family'].replace('.', ' ').split()[0]
+            authors = removeMiddleName(last + ", " + first)
+            if authors in removeMiddleName(yourFirstAuthor) or authors in removeMiddleName(
+                    convertSpecialCharsToUTF8(yourFirstAuthor)) or authors in removeMiddleName(
+                    yourLastAuthor) or authors in removeMiddleName(convertSpecialCharsToUTF8(yourLastAuthor)):
+                selfCiteCheck += 1
+    return selfCiteCheck
\ No newline at end of file
diff --git a/utils/queries.py b/utils/queries.py
index 7d0987e..7cac1ac 100644
--- a/utils/queries.py
+++ b/utils/queries.py
@@ -1,4 +1,4 @@
-def namesFromXref(doi, title, authorPos):
+def namesFromXref(cr, doi, title, authorPos):
     '''Use DOI and article titles to query Crossref for author list'''
     if authorPos == 'first':
         idx = 0
@@ -29,23 +29,4 @@ def namesFromXref(doi, title, authorPos):
     return name
 
 
-def namesFromXrefSelfCite(doi, title):
-    selfCiteCheck = 0
-    # get cross ref data
-    authors = ['']
-    # first try DOI
-    if doi != "":
-        works = cr.works(query=title, select=["DOI", "author"], limit=1, filter={'doi': doi})
-        if works['message']['total-results'] > 0:
-            authors = works['message']['items'][0]['author']
 
-    for i in authors:
-        if i != "":
-            first = i['given'].replace('.', ' ').split()[0]
-            last = i['family'].replace('.', ' ').split()[0]
-            authors = removeMiddleName(last + ", " + first)
-            if authors in removeMiddleName(yourFirstAuthor) or authors in removeMiddleName(
-                    convertSpecialCharsToUTF8(yourFirstAuthor)) or authors in removeMiddleName(
-                    yourLastAuthor) or authors in removeMiddleName(convertSpecialCharsToUTF8(yourLastAuthor)):
-                selfCiteCheck += 1
-    return selfCiteCheck

From ff38c5010ca2eedc83e7efa25728537819ff1a23 Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Mon, 2 May 2022 14:49:22 -0400
Subject: [PATCH 05/47] add test aux

---
 tests/document.aux | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 tests/document.aux

diff --git a/tests/document.aux b/tests/document.aux
new file mode 100644
index 0000000..ed771f5
--- /dev/null
+++ b/tests/document.aux
@@ -0,0 +1,45 @@
+\relax 
+\citation{Lundine2019}
+\citation{wang2021gendered}
+\citation{zurn2020network}
+\citation{moralia2005}
+\citation{bassett2022curious}
+\citation{fake2022}
+\citation{Dworkin2020.01.03.894378}
+\citation{maliniak2013gender}
+\citation{caplar2017quantitative}
+\citation{mitchell2013gendered}
+\citation{dion2018gendered}
+\citation{zhou_dale_2020_3672110}
+\citation{ambekar2009name}
+\citation{sood2018predicting}
+\citation{bertolero2021racial}
+\citation{chatterjee2021gender}
+\citation{fulvio2021imbalance}
+\citation{ethnicolr2022black}
+\citation{ethnicolr2022hispanic}
+\citation{ethnicolr2022asian}
+\citation{ethnicolr2022white}
+\bibstyle{ieeetr}
+\bibdata{testBib_immaculate.bib}
+\bibcite{Lundine2019}{1}
+\bibcite{wang2021gendered}{2}
+\bibcite{zurn2020network}{3}
+\bibcite{moralia2005}{4}
+\bibcite{bassett2022curious}{5}
+\bibcite{fake2022}{6}
+\bibcite{Dworkin2020.01.03.894378}{7}
+\bibcite{maliniak2013gender}{8}
+\bibcite{caplar2017quantitative}{9}
+\bibcite{mitchell2013gendered}{10}
+\bibcite{dion2018gendered}{11}
+\bibcite{zhou_dale_2020_3672110}{12}
+\bibcite{ambekar2009name}{13}
+\bibcite{sood2018predicting}{14}
+\bibcite{bertolero2021racial}{15}
+\bibcite{chatterjee2021gender}{16}
+\bibcite{fulvio2021imbalance}{17}
+\bibcite{ethnicolr2022black}{18}
+\bibcite{ethnicolr2022hispanic}{19}
+\bibcite{ethnicolr2022asian}{20}
+\bibcite{ethnicolr2022white}{21}

From 22bc19ebef833a52ba2d2b710e298d290e78a7c0 Mon Sep 17 00:00:00 2001
From: Jennifer Stiso <jeni.stiso@gmail.com>
Date: Mon, 2 May 2022 15:08:47 -0400
Subject: [PATCH 06/47] breaking first cell into fn units

---
 .gitignore             |   1 +
 requirements.txt       | 136 ++++++++++++--
 utils/preprocessing.py | 397 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 518 insertions(+), 16 deletions(-)

diff --git a/.gitignore b/.gitignore
index 10346f0..c7b6bf3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .DS_Store
 .ipynb_checkpoints/*
+env/*
diff --git a/requirements.txt b/requirements.txt
index a4ffc94..ff17b13 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,15 +1,121 @@
-pybtex==0.22.2
-numpy==1.19.2
-bibtexparser==1.1.0
-pandas==1.1.3
-pylatexenc==2.1
-habanero==0.7.2
-ethnicolr==0.4.0
-matplotlib==3.3.2
-seaborn==0.11.0
-scipy==1.5.3
-h5py==2.10.0
-oauthlib==3.0.1
-rsa==4.7
-Keras==2.2.4
-tensorflow==1.15.2
\ No newline at end of file
+absl-py==0.15.0
+appnope==0.1.3
+argon2-cffi==21.3.0
+argon2-cffi-bindings==21.2.0
+asttokens==2.0.5
+astunparse==1.6.3
+attrs==21.4.0
+backcall==0.2.0
+beautifulsoup4==4.11.1
+bibtexparser==1.2.0
+bleach==5.0.0
+cachetools==5.0.0
+certifi==2021.10.8
+cffi==1.15.0
+charset-normalizer==2.0.12
+cycler==0.11.0
+debugpy==1.6.0
+decorator==5.1.1
+defusedxml==0.7.1
+entrypoints==0.4
+ethnicolr==0.8.1
+executing==0.8.3
+fastjsonschema==2.15.3
+flatbuffers==1.12
+fonttools==4.33.3
+future==0.18.2
+gast==0.4.0
+google-auth==2.6.6
+google-auth-oauthlib==0.4.6
+google-pasta==0.2.0
+grpcio==1.34.1
+h5py==3.1.0
+habanero==1.2.0
+idna==3.3
+importlib-metadata==4.11.3
+ipykernel==6.13.0
+ipython==8.3.0
+ipython-genutils==0.2.0
+ipywidgets==7.7.0
+jedi==0.18.1
+Jinja2==3.1.2
+jsonschema==4.4.0
+jupyter==1.0.0
+jupyter-client==7.3.0
+jupyter-console==6.4.3
+jupyter-core==4.10.0
+jupyterlab-pygments==0.2.2
+jupyterlab-widgets==1.1.0
+keras==2.8.0
+keras-nightly==2.5.0.dev2021032900
+Keras-Preprocessing==1.1.2
+kiwisolver==1.4.2
+latexcodec==2.0.1
+Markdown==3.3.6
+MarkupSafe==2.1.1
+matplotlib==3.5.1
+matplotlib-inline==0.1.3
+mistune==0.8.4
+nbclient==0.6.0
+nbconvert==6.5.0
+nbformat==5.3.0
+nest-asyncio==1.5.5
+notebook==6.4.11
+numpy==1.19.5
+oauthlib==3.2.0
+opt-einsum==3.3.0
+packaging==21.3
+pandas==1.4.2
+pandocfilters==1.5.0
+parso==0.8.3
+pexpect==4.8.0
+pickleshare==0.7.5
+Pillow==9.1.0
+prometheus-client==0.14.1
+prompt-toolkit==3.0.29
+protobuf==3.20.1
+psutil==5.9.0
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pyasn1==0.4.8
+pyasn1-modules==0.2.8
+pybtex==0.24.0
+pycparser==2.21
+Pygments==2.12.0
+pylatexenc==2.10
+pyparsing==3.0.8
+pyrsistent==0.18.1
+python-dateutil==2.8.2
+pytz==2022.1
+PyYAML==6.0
+pyzmq==22.3.0
+qtconsole==5.3.0
+QtPy==2.1.0
+requests==2.27.1
+requests-oauthlib==1.3.1
+rsa==4.8
+scipy==1.8.0
+seaborn==0.11.2
+Send2Trash==1.8.0
+six==1.15.0
+soupsieve==2.3.2.post1
+stack-data==0.2.0
+tensorboard==2.9.0
+tensorboard-data-server==0.6.1
+tensorboard-plugin-wit==1.8.1
+tensorflow==2.5.2
+tensorflow-estimator==2.5.0
+termcolor==1.1.0
+terminado==0.13.3
+tinycss2==1.1.1
+tornado==6.1
+tqdm==4.64.0
+traitlets==5.1.1
+typing-extensions==3.7.4.3
+urllib3==1.26.9
+wcwidth==0.2.5
+webencodings==0.5.1
+Werkzeug==2.1.2
+widgetsnbextension==3.6.0
+wrapt==1.12.1
+zipp==3.8.0
diff --git a/utils/preprocessing.py b/utils/preprocessing.py
index 09f5224..890a056 100644
--- a/utils/preprocessing.py
+++ b/utils/preprocessing.py
@@ -20,8 +20,34 @@ def checkcites_output(aux_file):
     else:
         return unused_array_final
 
+def clean_name(name, flag):
+    """
+
+    :param name:
+            flag: utf or latex
+    :return: clean_name
+    """
+    if flag=='latex':
+        clean_name = convertLatexSpecialChars(str(name)[7:-3]).replace(
+            "', Protected('", ""
+        ).replace(
+            "'), '", ""
+        )
+    elif flag=='utf':
+        clean_name = convertSpecialCharsToUTF8(str(name)[7:-3]).replace(
+            "', Protected('", ""
+        ).replace(
+            "'), '", ""
+        )
+    else:
+        raise ValueError
 
 def removeMiddleName(line):
+    """
+
+    :param line:
+    :return:
+    """
     arr = line.split()
     last = arr.pop()
     n = len(arr)
@@ -37,6 +63,11 @@ def removeMiddleName(line):
 
 
 def returnFirstName(line):
+    """
+
+    :param line:
+    :return:
+    """
     arr = line.split()
     n = len(arr)
     if n == 4:
@@ -51,14 +82,30 @@ def returnFirstName(line):
 
 
 def convertLatexSpecialChars(latex_text):
+    """
+
+    :param latex_text:
+    :return:
+    """
     return LatexNodes2Text().latex_to_text(latex_text)
 
 
 def convertSpecialCharsToUTF8(text):
+    """
+
+    :param text:
+    :return:
+    """
     data = LatexNodes2Text().latex_to_text(text)
     return unicodedata.normalize('NFD', data).encode('ascii', 'ignore').decode('utf-8')
 
 def namesFromXrefSelfCite(doi, title):
+    """
+
+    :param doi:
+    :param title:
+    :return:
+    """
     selfCiteCheck = 0
     # get cross ref data
     authors = ['']
@@ -77,4 +124,352 @@ def namesFromXrefSelfCite(doi, title):
                     convertSpecialCharsToUTF8(yourFirstAuthor)) or authors in removeMiddleName(
                     yourLastAuthor) or authors in removeMiddleName(convertSpecialCharsToUTF8(yourLastAuthor)):
                 selfCiteCheck += 1
-    return selfCiteCheck
\ No newline at end of file
+    return selfCiteCheck
+
+
+def find_unused_cites(paper_aux_file):
+    """
+
+    :param paper_aux_file: path to auxfile
+    :return:
+    """
+    print(checkcites_output(paper_aux_file))
+    unused_in_paper = checkcites_output(paper_aux_file)  # get citations in library not used in paper
+    print("Unused citations: ", unused_in_paper.count('=>'))
+
+def get_bib_data(homedir):
+    """
+
+    :param homedir: home directory
+    :return: bib_data
+    """
+    ID = glob.glob(homedir + '*bib')
+    with open(ID[0]) as bibtex_file:
+        bib_data = bibtexparser.bparser.BibTexParser(common_strings=True,
+                                                     ignore_nonstandard_types=False).parse_file(bibtex_file)
+    return bib_data
+
+def get_duplicates(bib_data):
+    """
+    take bib_data, and get duplicates
+    :param homedir: home directory
+    :return:
+    """
+
+    duplicates = []
+    for key in bib_data.entries_dict.keys():
+        count = str(bib_data.entries).count("'ID\': \'" + key + "\'")
+        if count > 1:
+            duplicates.append(key)
+
+    if len(duplicates) > 0:
+        raise ValueError("In your .bib file, we found and removed duplicate entries for:",
+                         ' '.join(map(str, duplicates)))
+
+
+def get_names_published(outPath, bib_data):
+    """
+    whole pipeline for published papers
+    :return: FA,
+            LA
+    """
+    FA = []
+    LA = []
+    counter = 1
+    selfCiteCount = 0
+    titleCount = 1  #
+    counterNoDOI = list()  # row index (titleCount) of entries with no DOI
+    outPath = homedir + 'cleanedBib.csv'
+
+    if os.path.exists(outPath):
+        os.remove(outPath)
+
+    with open(outPath, 'w', newline='') as csvfile:
+        writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
+        writer.writerow(['Article', 'FA', 'LA', 'Title', 'SelfCite', 'CitationKey'])
+
+    citedArticleDOI = list()
+    citedArticleNoDOI = list()
+    allArticles = list()
+    for entry in bib_data.entries:
+        my_string = entry['cited-references'].split('\n')
+        for citedArticle in my_string:
+            allArticles.append(citedArticle)
+            if citedArticle.partition("DOI ")[-1] == '':
+                citedArticleNoDOI.append(citedArticle)
+                counterNoDOI.append(titleCount)
+            else:
+                line = citedArticle.partition("DOI ")[-1].replace("DOI ", "").rstrip(".")
+                line = ''.join(c for c in line if c not in '{[}] ')
+                if "," in line:
+                    line = line.partition(",")[-1]
+                citedArticleDOI.append(line)
+                with open('citedArticlesDOI.csv', 'a', newline='') as csvfile:
+                    writer = csv.writer(csvfile, delimiter=',')
+                    writer.writerow([line])
+            titleCount += 1
+
+    articleNum = 0
+    for doi in citedArticleDOI:
+        try:
+            FA = namesFromXref(doi, '', 'first')
+        except UnboundLocalError:
+            sleep(1)
+            continue
+
+        try:
+            LA = namesFromXref(doi, '', 'last')
+        except UnboundLocalError:
+            sleep(1)
+            continue
+
+        try:
+            selfCiteCount = namesFromXrefSelfCite(doi, '')
+        except UnboundLocalError:
+            sleep(1)
+            continue
+
+        with open(outPath, 'a', newline='') as csvfile:
+            if selfCiteCount == 0:
+                writer = csv.writer(csvfile, delimiter=',')
+                getArticleIndex = [i for i, s in enumerate(allArticles) if doi in s]
+                writer.writerow([counter, convertSpecialCharsToUTF8(FA), convertSpecialCharsToUTF8(LA),
+                                 allArticles[[i for i, s in enumerate(allArticles) if doi in s][0]], '', ''])
+                print(str(counter) + ": " + doi)
+                counter += 1
+            else:
+                print(str(articleNum) + ": " + doi + "\t\t\t <-- self-citation")
+        articleNum += 1
+
+    if len(citedArticleNoDOI) > 0:
+        print()
+        for elem in citedArticleNoDOI:
+            with open(outPath, 'a', newline='') as csvfile:
+                writer = csv.writer(csvfile, delimiter=',')
+                writer.writerow([counter, '', '', elem, '', ''])
+                print(str(counter) + ": " + elem)
+            counter += 1
+        print()
+        raise ValueError("WARNING: No article DOI was provided for the last " + str(
+            len(citedArticleNoDOI)) + " listed papers. Please manually search for these articles. IF AND ONLY IF your citing paper's first and last author are not co-authors in the paper that was cited, enter the first name of the first and last authors of the paper that was cited manually. Then, continue to the next code block.")
+
+    return FA, LA
+
+
+def get_names(bib_data):
+    """
+    take bib_data, and get lists of first and last names. should also get self cites and CDS cites
+    :return: FA
+              LA
+    """
+    counter = 1
+    nameCount = 0
+    outPath = homedir + 'cleanedBib.csv'
+
+    if os.path.exists(outPath):
+        os.remove(outPath)
+
+    with open(outPath, 'w', newline='') as csvfile:
+        writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
+        writer.writerow(['Article', 'FA', 'LA', 'Title', 'SelfCite', 'CitationKey'])
+
+    for key in bib_data.entries.keys():
+        diversity_bib_titles = ['The extent and drivers of gender imbalance in neuroscience reference lists',
+                                'The gender citation gap in international relations',
+                                'Quantitative evaluation of gender bias in astronomical publications from citation counts',
+                                '\# CommunicationSoWhite',
+                                '{Just Ideas? The Status and Future of Publication Ethics in Philosophy: A White Paper}',
+                                'Gendered citation patterns across political science and social science methodology fields',
+                                'Gender Diversity Statement and Code Notebook v1.0',
+                                'Racial and ethnic imbalance in neuroscience reference lists and intersections with gender',
+                                'Gender Diversity Statement and Code Notebook v1.1',
+                                'Gendered citation practices in the field of communication',
+                                'Gender disparity in citations in high- impact journal articles',
+                                'Gender (im)balance in citation practices in cognitive neuroscience',
+                                'Name-ethnicity classification from open sources',
+                                'Predicting race and ethnicity from the sequence of characters in a name']
+        if bib_data.entries[key].fields['title'] in diversity_bib_titles:
+            continue
+
+        try:
+            author = bib_data.entries[key].persons['author']
+        except:
+            author = bib_data.entries[key].persons['editor']
+        FA = author[0].rich_first_names
+        LA = author[-1].rich_first_names
+        FA = convertLatexSpecialChars(str(FA)[7:-3]).translate(str.maketrans('', '', string.punctuation)).replace(
+            'Protected', "").replace(" ", '')
+        LA = convertLatexSpecialChars(str(LA)[7:-3]).translate(str.maketrans('', '', string.punctuation)).replace(
+            'Protected', "").replace(" ", '')
+
+        # check if we grabbed a first initial when a full middle name was available
+        if (len(FA) == 1):
+            mn = author[0].rich_middle_names
+            mn = convertLatexSpecialChars(str(mn)[7:-3]).translate(
+                str.maketrans('', '', string.punctuation)).replace('Protected', "").replace(" ", '')
+            if len(mn) > 1:
+                FA = mn
+        if (len(LA) == 1):
+            mn = author[-1].rich_middle_names
+            mn = convertLatexSpecialChars(str(mn)[7:-3]).translate(
+                str.maketrans('', '', string.punctuation)).replace('Protected', "").replace(" ", '')
+            if len(mn) > 1:
+                LA = mn
+
+        # check that we got a name (not an initial) from the bib file, if not try using the title in the crossref API
+        try:
+            title = bib_data.entries[key].fields['title'].replace(',', '').\
+                replace(',', '').replace('{', '').replace('}','')
+        except:
+            title = ''
+        try:
+            doi = bib_data.entries[key].fields['doi']
+        except:
+            doi = ''
+        if FA == '' or len(FA.split('.')[0]) <= 1:
+            while True:
+                try:
+                    FA = namesFromXref(doi, title, 'first')
+                except UnboundLocalError:
+                    sleep(1)
+                    continue
+                break
+        if LA == '' or len(LA.split('.')[0]) <= 1:
+            while True:
+                try:
+                    LA = namesFromXref(doi, title, 'last')
+                except UnboundLocalError:
+                    sleep(1)
+                    continue
+                break
+
+        self_cites(author, yourFirstAuthor,yourLastAuthor, optionalEqualContributors)
+        counter += 1
+        with open(outPath, 'a', newline='') as csvfile:
+            writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
+            writer.writerow(
+                [counter, convertSpecialCharsToUTF8(FA), convertSpecialCharsToUTF8(LA), title, selfCite, key])
+
+
+def self_cites(author, yourFirstAuthor, yourLastAuthor, optionalEqualContributors):
+    """
+    take author list, and find self citations
+
+    :param author:
+    :param yourFirstAuthor:
+    :param yourLastAuthor:
+    :param optionalEqualContributors:
+    :return:
+    """
+    if (yourFirstAuthor == 'LastName, FirstName OptionalMiddleInitial') or (
+            yourLastAuthor == 'LastName, FirstName OptionalMiddleInitial'):
+        raise ValueError("Please enter your manuscript's first and last author names")
+
+    selfCiteCheck1 = [s for s in author if removeMiddleName(yourLastAuthor) in
+                      str(
+                          [clean_name(s.rich_last_names, 'latex'),
+                           clean_name(s.rich_first_names, 'latex')]
+                      ).replace("'", "")]
+
+    selfCiteCheck1a = [s for s in author if removeMiddleName(yourLastAuthor) in
+                      str(
+                          [clean_name(s.rich_last_names, 'utf'),
+                           clean_name(s.rich_first_names, 'utf')]
+                      ).replace("'", "")]
+    selfCiteCheck1b = [s for s in author if removeMiddleName(yourLastAuthor) in
+                       str(
+                           [clean_name(s.rich_last_names, 'utf'),
+                            LA]).replace("'","")]
+    # I was in the process of cleaning all thisup when we stopped
+    selfCiteCheck2 = [s for s in author if removeMiddleName(yourFirstAuthor) in str([
+        convertLatexSpecialChars(
+            str(s.rich_last_names)[
+            7:-3]).replace(
+            "', Protected('",
+            "").replace(
+            "'), '", ""),
+        convertLatexSpecialChars(
+            str(s.rich_first_names)[
+            7:-3]).replace(
+            "', Protected('",
+            "").replace(
+            "'), '",
+            "")]).replace(
+        "'", "")]
+    selfCiteCheck2a = [s for s in author if removeMiddleName(yourFirstAuthor) in str([
+        convertSpecialCharsToUTF8(
+            str(s.rich_last_names)[
+            7:-3]).replace(
+            "', Protected('",
+            "").replace(
+            "'), '", ""),
+        convertSpecialCharsToUTF8(
+            str(s.rich_first_names)[
+            7:-3]).replace(
+            "', Protected('",
+            "").replace(
+            "'), '",
+            "")]).replace(
+        "'", "")]
+    selfCiteCheck2b = [s for s in author if removeMiddleName(yourFirstAuthor) in str([
+        convertSpecialCharsToUTF8(
+            str(s.rich_last_names)[
+            7:-3]).replace(
+            "', Protected('",
+            "").replace(
+            "'), '", ""),
+        FA]).replace("'",
+                     "")]
+
+    nameCount = 0
+    if optionalEqualContributors != (
+            'LastName, FirstName OptionalMiddleInitial', 'LastName, FirstName OptionalMiddleInitial'):
+        for name in optionalEqualContributors:
+            selfCiteCheck3 = [s for s in author if removeMiddleName(name) in str([convertLatexSpecialChars(
+                str(s.rich_last_names)[7:-3]).replace("', Protected('", "").replace("'), '", ""),
+                                                                                  convertLatexSpecialChars(
+                                                                                      str(s.rich_first_names)[
+                                                                                      7:-3]).replace(
+                                                                                      "', Protected('",
+                                                                                      "").replace("'), '",
+                                                                                                  "")]).replace(
+                "'", "")]
+            selfCiteCheck3a = [s for s in author if removeMiddleName(name) in str([
+                convertSpecialCharsToUTF8(
+                    str(s.rich_last_names)[
+                    7:-3]).replace(
+                    "', Protected('",
+                    "").replace(
+                    "'), '", ""),
+                convertSpecialCharsToUTF8(
+                    str(s.rich_first_names)[
+                    7:-3]).replace(
+                    "', Protected('",
+                    "").replace(
+                    "'), '",
+                    "")]).replace("'",
+                                  "")]
+            if len(selfCiteCheck3) > 0:
+                nameCount += 1
+            if len(selfCiteCheck3a) > 0:
+                nameCount += 1
+    selfCiteChecks = [selfCiteCheck1, selfCiteCheck1a, selfCiteCheck1b, selfCiteCheck2, selfCiteCheck2a,
+                      selfCiteCheck2b]
+    if sum([len(check) for check in selfCiteChecks]) + nameCount > 0:
+        selfCite = 'Y'
+        if len(FA) < 2:
+            print(
+                str(counter) + ": " + key + "\t\t  <-- self-citation <--  ***NAME MISSING OR POSSIBLY INCOMPLETE***")
+        else:
+            print(str(counter) + ": " + key + "  <-- self-citation")
+    else:
+        selfCite = 'N'
+        if len(FA) < 2:
+            print(str(counter) + ": " + key + "\t\t  <--  ***NAME MISSING OR POSSIBLY INCOMPLETE***")
+        else:
+            print(str(counter) + ": " + key)
+
+
+
+
+

From eec793ed3b6b5ea790d9a4e5df8f1451521cca59 Mon Sep 17 00:00:00 2001
From: Jennifer Stiso <jeni.stiso@gmail.com>
Date: Mon, 9 May 2022 16:19:32 -0400
Subject: [PATCH 07/47] added pipeline and got imports working

---
 tests/__init__.py |  0
 tests/pipeline.py | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)
 create mode 100644 tests/__init__.py
 create mode 100644 tests/pipeline.py

diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/pipeline.py b/tests/pipeline.py
new file mode 100644
index 0000000..f0e1e70
--- /dev/null
+++ b/tests/pipeline.py
@@ -0,0 +1,37 @@
+import glob
+from habanero import Crossref
+import sys
+import os
+wd = os.getcwd()
+print(f'{wd[0:-6]}/utils')
+print(f'{wd[0:-6]}/utils')
+sys.path.insert(1, f'{wd[0:-6]}/utils')
+from preprocessing import *
+
+cr = Crossref()
+homedir = '/home/jovyan/'
+bib_files = glob.glob(homedir + '*.bib')
+paper_aux_file = glob.glob(homedir + '*.aux')
+paper_bib_file = 'library_paper.bib'
+try:
+    tex_file = glob.glob(homedir + "*.tex")[0]
+except:
+    print('No optional .tex file found.')
+
+yourFirstAuthor = 'LastName, FirstName OptionalMiddleInitial'
+yourLastAuthor = 'LastName, FirstName OptionalMiddleInitial'
+optionalEqualContributors = ['LastName, FirstName OptionalMiddleInitial', 'LastName, FirstName OptionalMiddleInitial']
+checkingPublishedArticle = False
+
+## end of user input
+if paper_aux_file:
+    find_unused_cites(paper_aux_file)
+
+bib_data = get_bib_data(homedir)
+if checkingPublishedArticle:
+    FA,LA = get_names_published(homedir, bib_data)
+else:
+    # find and print duplicates
+    get_duplicates(bib_data)
+    # get names, remove CDS, find self cites
+    FA,LA = get_names(bib_data)
\ No newline at end of file

From 65465c9f94be824e0e43817afa8e2543fcffabca Mon Sep 17 00:00:00 2001
From: Jennifer Stiso <jeni.stiso@gmail.com>
Date: Tue, 10 May 2022 14:31:46 -0400
Subject: [PATCH 08/47] added needed imports and arguments for 1st cell
 functions

---
 tests/immaculate/cleanedBib.csv         |  15 ++
 tests/immaculate/pipeline.py            |  37 ++++
 tests/immaculate/testBib_immaculate.bib | 257 ++++++++++++++++++++++++
 3 files changed, 309 insertions(+)
 create mode 100644 tests/immaculate/cleanedBib.csv
 create mode 100644 tests/immaculate/pipeline.py
 create mode 100644 tests/immaculate/testBib_immaculate.bib

diff --git a/tests/immaculate/cleanedBib.csv b/tests/immaculate/cleanedBib.csv
new file mode 100644
index 0000000..fb49df2
--- /dev/null
+++ b/tests/immaculate/cleanedBib.csv
@@ -0,0 +1,15 @@
+Article,FA,LA,Title,SelfCite,CitationKey
+2,Gyorgy,Edvard,Memory navigation and theta rhythm in the hippocampal-entorhinal system,N,buzsaki2013memory
+3,Jamie,Dina,“I don't see gender”: Conceptualizing a gendered system of academic publishing,N,Lundine2019
+4,Perry,Danielle,Network architectures supporting learnability,N,zurn2020network
+5,William,William,Moralia Volume VI,N,moralia2005
+6,Danielle,Perry,Curious Minds,N,bassett2022curious
+7,Danielle,Jennifer,fake,Y,fake2022
+8,,JH,N-gram language models,N,jurafsky2018n
+9,Sara,Holly,Gendered citation patterns in international relations journals,N,mitchell2013gendered
+10,Paula,Rachel,Gender Disparity in Citations in High-Impact Journal Articles,N,chatterjee2021gender
+11,Jacqueline,Bradley,Gender (Im)balance in Citation Practices in Cognitive Neuroscience,N,fulvio2021imbalance
+12,Denzel,Ketanji,Test of ethnicolr,N,ethnicolr2022black
+13,Rafael,Alexandria,Test of ethnicolr,N,ethnicolr2022hispanic
+14,Andrew,Michelle,Test of ethnicolr,N,ethnicolr2022asian
+15,Nicolas,Meryl,Test of ethnicolr,N,ethnicolr2022white
diff --git a/tests/immaculate/pipeline.py b/tests/immaculate/pipeline.py
new file mode 100644
index 0000000..999665b
--- /dev/null
+++ b/tests/immaculate/pipeline.py
@@ -0,0 +1,37 @@
+import glob
+from habanero import Crossref
+import sys
+import os
+from pathlib import Path
+wd = Path(os.getcwd())
+sys.path.insert(1, f'{wd.parent.parent.absolute()}/utils')
+from preprocessing import *
+
+cr = Crossref()
+#homedir = '/home/jovyan/'
+homedir = os.getcwd() + '/'
+bib_files = glob.glob(homedir + '*.bib')
+paper_aux_file = glob.glob(homedir + '*.aux')
+paper_bib_file = 'library_paper.bib'
+try:
+    tex_file = glob.glob(homedir + "*.tex")[0]
+except:
+    print('No optional .tex file found.')
+
+yourFirstAuthor = 'Stiso, Jennifer '
+yourLastAuthor = 'Bassett, Dani '
+optionalEqualContributors = ['Zhou, Dale']
+checkingPublishedArticle = False
+
+## end of user input
+if paper_aux_file:
+    find_unused_cites(paper_aux_file)
+
+bib_data = get_bib_data(homedir)
+if checkingPublishedArticle:
+    get_names_published(homedir, bib_data, cr)
+else:
+    # find and print duplicates
+    get_duplicates(bib_data)
+    # get names, remove CDS, find self cites
+    get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)
\ No newline at end of file
diff --git a/tests/immaculate/testBib_immaculate.bib b/tests/immaculate/testBib_immaculate.bib
new file mode 100644
index 0000000..039fa4c
--- /dev/null
+++ b/tests/immaculate/testBib_immaculate.bib
@@ -0,0 +1,257 @@
+@article{buzsaki2013memory,
+  title={Memory, navigation and theta rhythm in the hippocampal-entorhinal system},
+  author={Buzs{\'a}ki, Gy{\"o}rgy and Moser, Edvard I},
+  journal={Nature neuroscience},
+  volume={16},
+  number={2},
+  pages={130},
+  year={2013},
+  publisher={Nature Publishing Group}
+}
+
+@article{Lundine2019,
+
+                abstract = {Academic experts share their ideas, as well as contribute to advancing health science by participating in publishing as an author, reviewer and editor. The academy shapes and is shaped by knowledge produced within it. As such, the production of scientific knowledge can be described as part of a socially constructed system. Like all socially constructed systems, scientific knowledge production is influenced by gender. This study investigated one layer of this system through an analysis of journal editors' understanding of if and how gender influences editorial practices in peer reviewed health science journals. The study involved two stages: 1) exploratory in-depth qualitative interviews with editors at health science journals; and 2) a nominal group technique (NGT) with experts working on gender in research, academia and the journal peer review process. Our findings indicate that some editors had not considered the impact of gender on their editorial work. Many described how they actively strive to be ‘gender blind,' as this was seen as a means to be objective. This view fails to recognize how broader social structures operate to produce systemic inequities. None of the editors or publishers in this study were collecting gender or other social indicators as part of the article submission process. These findings suggest that there is room for editors and publishers to play a more active role in addressing structural inequities in academic publishing to ensure a diversity of knowledge and ideas are reflected.},
+
+                author = {Lundine, J. and Bourgeault, Ivy Lynn and Glonti, Ketevan and Hutchinson, Eleanor and Balabanova, Dina},
+
+                doi = {10.1016/j.socscimed.2019.112388},
+
+                file = {:Users/stiso/Downloads/1-s2.0-S0277953619303739-main.pdf:pdf},
+
+                issn = {18735347},
+
+                journal = {Social Science and Medicine},
+
+                keywords = {Feminist science studies,Gender inequity,Health sciences,Journalology,Peer review,Publishing},
+
+                number = {January},
+
+                pages = {112388},
+
+                pmid = {31288167},
+
+                publisher = {Elsevier},
+
+                title = {{“I don't see gender”: Conceptualizing a gendered system of academic publishing}},
+
+                url = {https://doi.org/10.1016/j.socscimed.2019.112388},
+
+                volume = {235},
+
+                year = {2019}
+
+}
+
+@article{wang2021gendered,
+
+                author = {Wang, Xinyi and Dworkin, Jordan D. and Zhou, Dale and Stiso, Jennifer and Falk, Emily B and Bassett, Danielle S. and Zurn, Perry and Lydon-Staley, David M.},
+
+                year = {2021},
+
+                title = {Gendered citation practices in the field of communication},
+
+                journal = {Annals of the International Communication Association},
+
+                doi = {10.1080/23808985.2021.1960180},
+}
+
+@article{zurn2020network,
+  title={Network architectures supporting learnability},
+  author={Zurn, Perry and Bassett, Danielle S},
+  journal={Philosophical Transactions of the Royal Society B},
+  volume={375},
+  number={1796},
+  pages={20190323},
+  year={2020},
+  publisher={The Royal Society}
+}
+
+@book{moralia2005,
+  title={Moralia, Volume VI},
+  author={Plutarch, Helmbold, William},
+  year={1939},
+  publisher={Harvard University Press}
+}
+
+@book{bassett2022curious, 
+title={Curious Minds}, 
+author={Danielle S. Bassett and Perry Zurn},
+publisher={MIT Press}, 
+year={2022},
+}
+
+@book{fake2022, 
+title={fake}, 
+author={Danielle S. Bassett and Dale Zhou and Jennifer Stiso},
+publisher={MIT Press}, 
+year={2022},
+}
+
+@article{jurafsky2018n,
+  title={N-gram language models},
+  author={Jurafsky, D and Martin, JH},
+  journal={Speech and Language Processing: An Introduction to Natural Language Processing, Computational Linguistics, and Speech Recognition},
+  year={2018}
+}
+
+@article {Dworkin2020.01.03.894378,
+  author = {Dworkin, Jordan D. and Linn, Kristin A. and Teich, Erin G. and Zurn, Perry and Shinohara, Russell T. and Bassett, Danielle S.},
+  title = {The extent and drivers of gender imbalance in neuroscience reference lists},
+  elocation-id = {2020.01.03.894378},
+  year = {2020},
+  doi = {10.1101/2020.01.03.894378},
+  publisher = {Cold Spring Harbor Laboratory},
+  abstract = {Like many scientific disciplines, neuroscience has increasingly attempted to confront pervasive gender imbalances within the field. While much of the conversation has centered around publishing and conference participation, recent research in other fields has called attention to the prevalence of gender bias in citation practices. Because of the downstream effects that citations can have on visibility and career advancement, understanding and eliminating gender bias in citation practices is vital for addressing inequity in a scientific community. In this study, we sought to determine whether there is evidence of gender bias in the citation practices of neuroscientists. Utilizing data from five top neuroscience journals, we indeed find that reference lists tend to include more papers with men as first and last author than would be expected if gender was not a factor in referencing. Importantly, we show that this overcitation of men and undercitation of women is driven largely by the citation practices of men, and is increasing with time despite greater diversity in the academy. We develop a co-authorship network to determine the degree to which homophily in researchers{\textquoteright} social networks explains gendered citation practices and we find that men tend to overcite other men even when their social networks are representative of the field. We discuss possible mechanisms and consider how individual researchers might incorporate these findings into their own referencing practices.},
+  URL = {https://www.biorxiv.org/content/early/2020/01/11/2020.01.03.894378},
+  eprint = {https://www.biorxiv.org/content/early/2020/01/11/2020.01.03.894378.full.pdf},
+  journal = {bioRxiv}
+}
+
+@article{maliniak2013gender,
+  title={The gender citation gap in international relations},
+  author={Maliniak, Daniel and Powers, Ryan and Walter, Barbara F},
+  journal={International Organization},
+  volume={67},
+  number={4},
+  pages={889--922},
+  year={2013},
+  publisher={Cambridge University Press}
+}
+
+@article{caplar2017quantitative,
+  title={Quantitative evaluation of gender bias in astronomical publications from citation counts},
+  author={Caplar, Neven and Tacchella, Sandro and Birrer, Simon},
+  journal={Nature Astronomy},
+  volume={1},
+  number={6},
+  pages={0141},
+  year={2017},
+  publisher={Nature Publishing Group}
+}
+
+@article{mitchell2013gendered,
+  title={Gendered citation patterns in international relations journals},
+  author={Mitchell, Sara McLaughlin and Lange, Samantha and Brus, Holly},
+  journal={International Studies Perspectives},
+  volume={14},
+  number={4},
+  pages={485--492},
+  year={2013},
+  publisher={Blackwell Publishing Ltd Oxford, UK}
+}
+
+@article{dion2018gendered,
+  title={Gendered citation patterns across political science and social science methodology fields},
+  author={Dion, Michelle L and Sumner, Jane Lawrence and Mitchell, Sara McLaughlin},
+  journal={Political Analysis},
+  volume={26},
+  number={3},
+  pages={312--327},
+  year={2018},
+  publisher={Cambridge University Press}
+}
+
+@software{zhou_dale_2020_3672110,
+  author       = {Zhou, Dale and
+                  Cornblath, Eli J. and
+                  Stiso, Jennifer and
+                  Teich, Erin G. and
+                  Dworkin, Jordan D. and
+                  Blevins, Ann S. and
+                  Bassett, Danielle S.},
+  title        = {Gender Diversity Statement and Code Notebook v1.0},
+  month        = feb,
+  year         = 2020,
+  publisher    = {Zenodo},
+  version      = {v1.0},
+  doi          = {10.5281/zenodo.3672110},
+  url          = {https://doi.org/10.5281/zenodo.3672110}
+}
+
+@inproceedings{ambekar2009name,
+  title={Name-ethnicity classification from open sources},
+  author={Ambekar, Anurag and Ward, Charles and Mohammed, Jahangir and Male, Swapna and Skiena, Steven},
+  booktitle={Proceedings of the 15th ACM SIGKDD international conference on Knowledge Discovery and Data Mining},
+  pages={49--58},
+  year={2009}
+}
+
+@article{sood2018predicting,
+  title={Predicting race and ethnicity from the sequence of characters in a name},
+  author={Sood, Gaurav and Laohaprapanon, Suriyan},
+  journal={arXiv preprint arXiv:1805.02109},
+  year={2018}
+}
+
+@article{bertolero2021racial,
+title = {Racial and ethnic imbalance in neuroscience reference lists and intersections with gender},
+author = {Bertolero, Maxwell A. and Dworkin, Jordan D. and David, Sophia U. and Lloreda, Claudia López and Srivastava, Pragya and Stiso, Jennifer and Zhou, Dale and Dzirasa, Kafui and Fair, Damien A. and  Kaczkurkin, Antonia N. and Marlin, Bianca Jones and Shohamy, Daphna and Uddin, Lucina Q. and Zurn, Perry and Bassett, Danielle S.},
+journal = {bioRxiv},
+year = {2020},
+xoi = {10.1101/2020.10.12.336230},
+}
+
+@article{chatterjee2021gender,
+journal = {JAMA Netw Open},
+year = {2021},
+volume = {4},
+number = {7},
+pages = {e2114509},
+title = {Gender Disparity in Citations in High-Impact Journal Articles},
+author = {Chatterjee, Paula and Werner, Rachel M},
+}
+
+@article{fulvio2021imbalance,
+title = {Gender (Im)balance in Citation Practices in Cognitive Neuroscience},
+author = {Fulvio, Jacqueline M and Akinnola, Ileri and Postle, Bradley R},
+journal = {J Cogn Neurosci},
+year = {2021},
+volume = {33},
+number = {1},
+pages = {3-7},
+}
+
+@article{ethnicolr2022black,
+  title={Test of ethnicolr},
+  author={Washington, Denzel and Brown-Jackson, Ketanji},
+  journal={Nature neuroscience},
+  volume={16},
+  number={2},
+  pages={130},
+  year={2013},
+  publisher={Nature Publishing Group}
+}
+
+@article{ethnicolr2022hispanic,
+  title={Test of ethnicolr},
+  author={Cruz, Rafael and Ocasio-Cortez, Alexandria},
+  journal={Nature neuroscience},
+  volume={16},
+  number={2},
+  pages={130},
+  year={2013},
+  publisher={Nature Publishing Group}
+}
+
+@article{ethnicolr2022asian,
+  title={Test of ethnicolr},
+  author={Wang, Andrew and Yeoh, Michelle},
+  journal={Nature neuroscience},
+  volume={16},
+  number={2},
+  pages={130},
+  year={2013},
+  publisher={Nature Publishing Group}
+}
+
+@article{ethnicolr2022white,
+  title={Test of ethnicolr},
+  author={Coppola, Nicolas and Streep, Meryl},
+  journal={Nature neuroscience},
+  volume={16},
+  number={2},
+  pages={130},
+  year={2013},
+  publisher={Nature Publishing Group}
+}
\ No newline at end of file

From f1988b9fbb5d95baf3cc8a58f48dd34f03ced3e3 Mon Sep 17 00:00:00 2001
From: Jennifer Stiso <jeni.stiso@gmail.com>
Date: Wed, 11 May 2022 13:16:34 -0400
Subject: [PATCH 09/47] added function for finding unused citations in aux file

---
 __init__.py                                   |   0
 cleanBib.ipynb                                | 408 +-------
 tests/aux/checkcites.lua                      | 869 ++++++++++++++++++
 tests/{immaculate => aux}/cleanedBib.csv      |   0
 tests/{ => aux}/document.aux                  |   0
 tests/{ => aux}/pipeline.py                   |  20 +-
 tests/{ => aux}/testBib_immaculate.bib        |   0
 tests/cleanedBib.csv                          |   1 +
 tests/erroneous/pipeline.py                   |  37 +
 tests/{ => erroneous}/testBib_erroneous.bib   |   0
 tests/immaculate/cleanedBib_test.csv          |  15 +
 .../__pycache__/preprocessing.cpython-39.pyc  | Bin 0 -> 13102 bytes
 utils/__pycache__/queries.cpython-39.pyc      | Bin 0 -> 838 bytes
 utils/preprocessing.py                        | 150 ++-
 14 files changed, 1004 insertions(+), 496 deletions(-)
 create mode 100644 __init__.py
 create mode 100755 tests/aux/checkcites.lua
 rename tests/{immaculate => aux}/cleanedBib.csv (100%)
 rename tests/{ => aux}/document.aux (100%)
 rename tests/{ => aux}/pipeline.py (57%)
 rename tests/{ => aux}/testBib_immaculate.bib (100%)
 create mode 100644 tests/cleanedBib.csv
 create mode 100644 tests/erroneous/pipeline.py
 rename tests/{ => erroneous}/testBib_erroneous.bib (100%)
 create mode 100644 tests/immaculate/cleanedBib_test.csv
 create mode 100644 utils/__pycache__/preprocessing.cpython-39.pyc
 create mode 100644 utils/__pycache__/queries.cpython-39.pyc

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/cleanBib.ipynb b/cleanBib.ipynb
index 2d7e005..a5f6cff 100644
--- a/cleanBib.ipynb
+++ b/cleanBib.ipynb
@@ -75,115 +75,6 @@
     "from pybtex.database import parse_file\n",
     "import seaborn as sns\n",
     "\n",
-    "\n",
-    "def checkcites_output(aux_file):\n",
-    "    '''take in aux file for tex document, return list of citation keys\n",
-    "    that are in .bib file but not in document'''\n",
-    "\n",
-    "    result = subprocess.run(['texlua', 'checkcites.lua', aux_file[0]], stdout=subprocess.PIPE)\n",
-    "    result = result.stdout.decode('utf-8')\n",
-    "    unused_array_raw = result.split('\\n')\n",
-    "    # process array of unused references + other output \n",
-    "    unused_array_final = list()\n",
-    "    for x in unused_array_raw:\n",
-    "        if len(x) > 0: # if line is not empty\n",
-    "            if x[0] == '-':  # and if first character is a '-', it's a citation key\n",
-    "                unused_array_final.append(x[2:]) # truncate '- '            \n",
-    "    if \"------------------------------------------------------------------------\" in unused_array_final:\n",
-    "        return(result)\n",
-    "    else:\n",
-    "        return(unused_array_final)\n",
-    "\n",
-    "\n",
-    "def removeMiddleName(line):\n",
-    "    arr = line.split()\n",
-    "    last = arr.pop()\n",
-    "    n = len(arr)\n",
-    "    if n == 4:\n",
-    "        first, middle = ' '.join(arr[:2]), ' '.join(arr[2:])\n",
-    "    elif n == 3:\n",
-    "        first, middle = arr[0], ' '.join(arr[1:])\n",
-    "    elif n == 2:\n",
-    "        first, middle = arr\n",
-    "    elif n==1:\n",
-    "        return line\n",
-    "    return(str(first + ' ' + middle))\n",
-    "\n",
-    "\n",
-    "def returnFirstName(line):\n",
-    "    arr = line.split()\n",
-    "    n = len(arr)\n",
-    "    if n == 4:\n",
-    "        first, middle = ' '.join(arr[:2]), ' '.join(arr[2:])\n",
-    "    elif n == 3:\n",
-    "        first, middle = arr[0], ' '.join(arr[1:])\n",
-    "    elif n == 2:\n",
-    "        first, middle = arr\n",
-    "    elif n==1:\n",
-    "        return line\n",
-    "    return(str(middle))\n",
-    "\n",
-    "\n",
-    "def convertLatexSpecialChars(latex_text):\n",
-    "    return LatexNodes2Text().latex_to_text(latex_text)\n",
-    "\n",
-    "\n",
-    "def convertSpecialCharsToUTF8(text):\n",
-    "    data = LatexNodes2Text().latex_to_text(text)\n",
-    "    return unicodedata.normalize('NFD', data).encode('ascii', 'ignore').decode('utf-8')\n",
-    "\n",
-    "\n",
-    "def namesFromXref(doi, title, authorPos):\n",
-    "    '''Use DOI and article titles to query Crossref for author list'''\n",
-    "    if authorPos == 'first':\n",
-    "        idx = 0\n",
-    "    elif authorPos == 'last':\n",
-    "        idx = -1\n",
-    "    # get cross ref data\n",
-    "    authors = ['']\n",
-    "    # first try DOI\n",
-    "    if doi != \"\":\n",
-    "        works = cr.works(query = title, select = [\"DOI\",\"author\"], limit=1, filter = {'doi': doi})\n",
-    "        if works['message']['total-results'] > 0:\n",
-    "            authors = works['message']['items'][0]['author']\n",
-    "    elif title != '': \n",
-    "        works = cr.works(query = f'title:\"{title}\"', select = [\"title\",\"author\"], limit=10)\n",
-    "        cnt = 0\n",
-    "        name = ''\n",
-    "        # check that you grabbed the proper paper\n",
-    "        if works['message']['items'][cnt]['title'][0].lower() == title.lower():\n",
-    "            authors = works['message']['items'][0]['author']\n",
-    "\n",
-    "    # check the all fields are available\n",
-    "    if not 'given' in authors[idx]:\n",
-    "        name = ''\n",
-    "    else:\n",
-    "        # trim initials\n",
-    "        name = authors[idx]['given'].replace('.',' ').split()[0]\n",
-    "\n",
-    "    return name\n",
-    "\n",
-    "\n",
-    "def namesFromXrefSelfCite(doi, title):\n",
-    "    selfCiteCheck = 0\n",
-    "    # get cross ref data\n",
-    "    authors = ['']\n",
-    "    # first try DOI\n",
-    "    if doi != \"\":\n",
-    "        works = cr.works(query = title, select = [\"DOI\",\"author\"], limit=1, filter = {'doi': doi})\n",
-    "        if works['message']['total-results'] > 0:\n",
-    "            authors = works['message']['items'][0]['author']\n",
-    "    \n",
-    "    for i in authors:\n",
-    "        if i != \"\":\n",
-    "            first = i['given'].replace('.',' ').split()[0]\n",
-    "            last = i['family'].replace('.',' ').split()[0]\n",
-    "            authors = removeMiddleName(last + \", \" + first)\n",
-    "            if authors in removeMiddleName(yourFirstAuthor) or authors in removeMiddleName(convertSpecialCharsToUTF8(yourFirstAuthor)) or authors in removeMiddleName(yourLastAuthor) or authors in removeMiddleName(convertSpecialCharsToUTF8(yourLastAuthor)):\n",
-    "                selfCiteCheck += 1\n",
-    "    return selfCiteCheck\n",
-    "\n",
-    "\n",
     "cr = Crossref()\n",
     "homedir = '/home/jovyan/'\n",
     "bib_files = glob.glob(homedir + '*.bib')\n",
@@ -243,296 +134,17 @@
     "optionalEqualContributors = ['LastName, FirstName OptionalMiddleInitial', 'LastName, FirstName OptionalMiddleInitial']\n",
     "checkingPublishedArticle = False\n",
     "\n",
-    "if (yourFirstAuthor == 'LastName, FirstName OptionalMiddleInitial') or (yourLastAuthor == 'LastName, FirstName OptionalMiddleInitial'):\n",
-    "    raise ValueError(\"Please enter your manuscript's first and last author names\")\n",
-    "\n",
     "if paper_aux_file:\n",
-    "    if optionalEqualContributors == ['LastName, FirstName OptionalMiddleInitial', 'LastName, FirstName OptionalMiddleInitial']:\n",
-    "        citing_authors = np.array([yourFirstAuthor, yourLastAuthor])\n",
-    "    else:\n",
-    "        citing_authors = np.array([yourFirstAuthor, yourLastAuthor, optionalEqualContributors])\n",
-    "    print(checkcites_output(paper_aux_file))\n",
-    "    unused_in_paper = checkcites_output(paper_aux_file) # get citations in library not used in paper\n",
-    "    print(\"Unused citations: \", unused_in_paper.count('=>'))\n",
-    "    \n",
-    "    \n",
-    "    parser = BibTexParser()\n",
-    "    parser.ignore_nonstandard_types = False\n",
-    "    parser.common_strings = True\n",
-    "    \n",
-    "    bib_data = None\n",
-    "    for bib_file in bib_files:\n",
-    "        with open(bib_file) as bibtex_file:\n",
-    "            if bib_data is None:\n",
-    "                bib_data = bibtexparser.bparser.BibTexParser(common_strings=True, ignore_nonstandard_types=False).parse_file(bibtex_file)\n",
-    "            else:\n",
-    "                bib_data_extra = bibtexparser.bparser.BibTexParser(common_strings=True, ignore_nonstandard_types=False).parse_file(bibtex_file)\n",
-    "                bib_data.entries_dict.update(bib_data_extra.entries_dict)\n",
-    "                bib_data.entries.extend(bib_data_extra.entries)\n",
-    "    \n",
-    "    all_library_citations = list(bib_data.entries_dict.keys())\n",
-    "    print(\"All citations: \", len(all_library_citations))\n",
-    "    \n",
-    "    for k in all_library_citations:\n",
-    "        if re.search('\\\\b'+ k + '\\\\b', unused_in_paper.replace('\\n',' ').replace('=>',' ')) != None:\n",
-    "            del bib_data.entries_dict[k] # remove from entries dictionary if not in paper\n",
-    "            \n",
-    "    in_paper_mask = [re.search('\\\\b'+ bib_data.entries[x]['ID'] + '\\\\b', unused_in_paper.replace('\\n',' ').replace('=>',' ')) == None for x in range(len(bib_data.entries))]\n",
-    "    bib_data.entries = [bib_data.entries[x] for x in np.where(in_paper_mask)[0]] # replace entries list with entries only in paper\n",
-    "    del bib_data.comments\n",
-    "    \n",
-    "    duplicates = []\n",
-    "    for key in bib_data.entries_dict.keys():\n",
-    "        count = str(bib_data.entries).count(\"'ID\\': \\'\"+ key + \"\\'\")\n",
-    "        if count > 1:\n",
-    "            duplicates.append(key)\n",
-    "            \n",
-    "    if len(duplicates) > 0:\n",
-    "        raise ValueError(\"In your .bib file, please remove duplicate entries or duplicate entry ID keys for:\", ' '.join(map(str, duplicates)))\n",
-    "\n",
-    "    if os.path.exists(paper_bib_file):\n",
-    "        os.remove(paper_bib_file)\n",
-    "    \n",
-    "    with open(paper_bib_file, 'w') as bibtex_file:\n",
-    "        bibtexparser.dump(bib_data, bibtex_file)\n",
-    "    \n",
-    "    # define first author and last author names of citing paper -- will exclude citations of these authors\n",
-    "    # beware of latex symbols within author names\n",
-    "    # in_paper_citations = list(bib_data.entries_dict.keys())\n",
-    "    in_paper_citations = [bib_data.entries[x]['ID'] for x in range(len(bib_data.entries))] # get list of citation keys in paper\n",
-    "    \n",
-    "    # extract author list for every cited paper\n",
-    "    cited_authors = [bib_data.entries_dict[x]['author'] for x in in_paper_citations]\n",
-    "    # find citing authors in cited author list\n",
-    "    # using nested list comprehension, make a citing author -by- citation array of inclusion\n",
-    "    self_cite_mask = np.array([[str(citing_author) in authors for authors in cited_authors] for citing_author in citing_authors])\n",
-    "    self_cite_mask = np.any(self_cite_mask,axis=0) # collapse across citing authors such that any coauthorship by either citing author -> exclusion\n",
-    "    \n",
-    "    print(\"Self-citations: \", [bib_data.entries[x]['ID'] for x in np.where(self_cite_mask)[0]]) # print self citations\n",
-    "    for idx,k in enumerate(in_paper_citations):\n",
-    "        if self_cite_mask[idx]:\n",
-    "            del bib_data.entries_dict[k] # delete citation from dictionary if self citationi\n",
-    "    bib_data.entries = [bib_data.entries[x] for x in np.where(np.invert(self_cite_mask))[0]] # replace entries list with entries that aren't self citations\n",
-    "    \n",
-    "    paper_bib_file_excl_sc = os.path.splitext(paper_bib_file)[0] + '_noselfcite.bib'\n",
-    "    \n",
-    "    if os.path.exists(paper_bib_file_excl_sc):\n",
-    "        os.remove(paper_bib_file_excl_sc)\n",
-    "    \n",
-    "    with open(paper_bib_file_excl_sc, 'w') as bibtex_file:\n",
-    "        bibtexparser.dump(bib_data, bibtex_file)\n",
-    "        \n",
-    "    ID = glob.glob(homedir + paper_bib_file_excl_sc)\n",
-    "else:\n",
-    "    ID = glob.glob(homedir + '*bib')\n",
-    "    with open(ID[0]) as bibtex_file:\n",
-    "        bib_data = bibtexparser.bparser.BibTexParser(common_strings=True, ignore_nonstandard_types=False).parse_file(bibtex_file)\n",
-    "    duplicates = []\n",
-    "    for key in bib_data.entries_dict.keys():\n",
-    "        count = str(bib_data.entries).count(\"'ID\\': \\'\"+ key + \"\\'\")\n",
-    "        if count > 1:\n",
-    "            duplicates.append(key)\n",
-    "            \n",
-    "    if len(duplicates) > 0:\n",
-    "        raise ValueError(\"In your .bib file, please remove duplicate entries or duplicate entry ID keys for:\", ' '.join(map(str, duplicates)))\n",
-    "\n",
-    "if checkingPublishedArticle == True:\n",
-    "    FA = []\n",
-    "    LA = []\n",
-    "    counter = 1\n",
-    "    selfCiteCount = 0\n",
-    "    titleCount = 1 # \n",
-    "    counterNoDOI = list() # row index (titleCount) of entries with no DOI\n",
-    "    outPath = homedir + 'cleanedBib.csv'\n",
-    "\n",
-    "    if os.path.exists(outPath):\n",
-    "        os.remove(outPath)\n",
-    "\n",
-    "    with open(outPath, 'w', newline='') as csvfile:\n",
-    "        writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)\n",
-    "        writer.writerow(['Article', 'FA', 'LA', 'Title', 'SelfCite', 'CitationKey'])\n",
-    "    \n",
-    "    citedArticleDOI = list()\n",
-    "    citedArticleNoDOI = list()\n",
-    "    allArticles = list()\n",
-    "    for entry in bib_data.entries:\n",
-    "        my_string= entry['cited-references'].split('\\n')\n",
-    "        for citedArticle in my_string:\n",
-    "            allArticles.append(citedArticle)\n",
-    "            if citedArticle.partition(\"DOI \")[-1]=='':\n",
-    "                citedArticleNoDOI.append(citedArticle)\n",
-    "                counterNoDOI.append(titleCount)\n",
-    "            else:\n",
-    "                line = citedArticle.partition(\"DOI \")[-1].replace(\"DOI \",\"\").rstrip(\".\")\n",
-    "                line = ''.join( c for c in line if  c not in '{[}] ')\n",
-    "                if \",\" in line:\n",
-    "                    line = line.partition(\",\")[-1]\n",
-    "                citedArticleDOI.append(line)\n",
-    "                with open('citedArticlesDOI.csv', 'a', newline='') as csvfile:\n",
-    "                    writer = csv.writer(csvfile, delimiter=',')\n",
-    "                    writer.writerow([line])\n",
-    "            titleCount += 1\n",
-    "\n",
-    "    articleNum = 0\n",
-    "    for doi in citedArticleDOI:\n",
-    "        try:\n",
-    "            FA = namesFromXref(doi, '', 'first')\n",
-    "        except UnboundLocalError:\n",
-    "            sleep(1)\n",
-    "            continue\n",
-    "\n",
-    "        try:\n",
-    "            LA = namesFromXref(doi, '', 'last')\n",
-    "        except UnboundLocalError:\n",
-    "            sleep(1)\n",
-    "            continue\n",
-    "\n",
-    "        try:\n",
-    "            selfCiteCount = namesFromXrefSelfCite(doi, '')\n",
-    "        except UnboundLocalError:\n",
-    "            sleep(1)\n",
-    "            continue\n",
-    "\n",
-    "        with open(outPath, 'a', newline='') as csvfile:            \n",
-    "            if selfCiteCount == 0:\n",
-    "                writer = csv.writer(csvfile, delimiter=',')\n",
-    "                getArticleIndex = [i for i, s in enumerate(allArticles) if doi in s]\n",
-    "                writer.writerow([counter, convertSpecialCharsToUTF8(FA), convertSpecialCharsToUTF8(LA), allArticles[[i for i, s in enumerate(allArticles) if doi in s][0]], '', ''])\n",
-    "                print(str(counter) + \": \" + doi )\n",
-    "                counter += 1\n",
-    "            else:\n",
-    "                print(str(articleNum) + \": \" + doi + \"\\t\\t\\t <-- self-citation\" )\n",
-    "        articleNum += 1\n",
-    "\n",
-    "    if len(citedArticleNoDOI)>0:\n",
-    "        print()\n",
-    "        for elem in citedArticleNoDOI:\n",
-    "            with open(outPath, 'a', newline='') as csvfile:            \n",
-    "                writer = csv.writer(csvfile, delimiter=',')\n",
-    "                writer.writerow([counter, '', '', elem, '', ''])\n",
-    "                print(str(counter) + \": \" + elem )\n",
-    "            counter += 1\n",
-    "        print()\n",
-    "        raise ValueError(\"WARNING: No article DOI was provided for the last \" + str(len(citedArticleNoDOI)) + \" listed papers. Please manually search for these articles. IF AND ONLY IF your citing paper's first and last author are not co-authors in the paper that was cited, enter the first name of the first and last authors of the paper that was cited manually. Then, continue to the next code block.\")\n",
+    "    find_unused_cites(paper_aux_file)\n",
+    "\n",
+    "bib_data = get_bib_data(homedir)\n",
+    "if checkingPublishedArticle:\n",
+    "    get_names_published(homedir, bib_data, cr)\n",
     "else:\n",
-    "    FA = []\n",
-    "    LA = []\n",
-    "    parser = bibtex.Parser()\n",
-    "    bib_data = parser.parse_file(ID[0])\n",
-    "    counter = 1\n",
-    "    nameCount = 0\n",
-    "    outPath = homedir + 'cleanedBib.csv'\n",
-    "\n",
-    "    if os.path.exists(outPath):\n",
-    "        os.remove(outPath)\n",
-    "\n",
-    "    with open(outPath, 'w', newline='') as csvfile:\n",
-    "        writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)\n",
-    "        writer.writerow(['Article', 'FA', 'LA', 'Title', 'SelfCite', 'CitationKey'])\n",
-    "\n",
-    "    for key in bib_data.entries.keys():\n",
-    "        diversity_bib_titles = ['The extent and drivers of gender imbalance in neuroscience reference lists',\n",
-    "                                'The gender citation gap in international relations',\n",
-    "                                'Quantitative evaluation of gender bias in astronomical publications from citation counts',\n",
-    "                                '\\# CommunicationSoWhite',\n",
-    "                                '{Just Ideas? The Status and Future of Publication Ethics in Philosophy: A White Paper}',\n",
-    "                                'Gendered citation patterns across political science and social science methodology fields',\n",
-    "                                'Gender Diversity Statement and Code Notebook v1.0', \n",
-    "                                'Racial and ethnic imbalance in neuroscience reference lists and intersections with gender', \n",
-    "                                'Gender Diversity Statement and Code Notebook v1.1',\n",
-    "                                'Gendered citation practices in the field of communication',\n",
-    "                                'Gender disparity in citations in high- impact journal articles',\n",
-    "                                'Gender (im)balance in citation practices in cognitive neuroscience',\n",
-    "                                'Name-ethnicity classification from open sources',\n",
-    "                                'Predicting race and ethnicity from the sequence of characters in a name']\n",
-    "        if bib_data.entries[key].fields['title'] in diversity_bib_titles:\n",
-    "            continue\n",
-    "\n",
-    "        try:\n",
-    "            author = bib_data.entries[key].persons['author']\n",
-    "        except:\n",
-    "            author = bib_data.entries[key].persons['editor']\n",
-    "        FA = author[0].rich_first_names\n",
-    "        LA = author[-1].rich_first_names\n",
-    "        FA = convertLatexSpecialChars(str(FA)[7:-3]).translate(str.maketrans('', '', string.punctuation)).replace('Protected',\"\").replace(\" \",'')\n",
-    "        LA = convertLatexSpecialChars(str(LA)[7:-3]).translate(str.maketrans('', '', string.punctuation)).replace('Protected',\"\").replace(\" \",'')\n",
-    "        \n",
-    "        # check if we grabbed a first initial when a full middle name was available\n",
-    "        if (len(FA) == 1):\n",
-    "            mn = author[0].rich_middle_names\n",
-    "            mn = convertLatexSpecialChars(str(mn)[7:-3]).translate(str.maketrans('', '', string.punctuation)).replace('Protected',\"\").replace(\" \",'')\n",
-    "            if len(mn) > 1:\n",
-    "                FA = mn\n",
-    "        if (len(LA) == 1):\n",
-    "            mn = author[-1].rich_middle_names\n",
-    "            mn = convertLatexSpecialChars(str(mn)[7:-3]).translate(str.maketrans('', '', string.punctuation)).replace('Protected',\"\").replace(\" \",'')\n",
-    "            if len(mn) > 1:\n",
-    "                LA = mn\n",
-    "        \n",
-    "        # check that we got a name (not an initial) from the bib file, if not try using the title in the crossref API\n",
-    "        try:\n",
-    "            title = bib_data.entries[key].fields['title'].replace(',', '').replace(',', '').replace('{','').replace('}','')\n",
-    "        except:\n",
-    "            title = ''\n",
-    "        try:\n",
-    "            doi =  bib_data.entries[key].fields['doi']\n",
-    "        except:\n",
-    "            doi = ''\n",
-    "        if FA == '' or len(FA.split('.')[0]) <= 1:\n",
-    "            while True:\n",
-    "                try:\n",
-    "                    FA = namesFromXref(doi, title, 'first')\n",
-    "                except UnboundLocalError:\n",
-    "                    sleep(1)\n",
-    "                    continue\n",
-    "                break\n",
-    "        if LA == '' or len(LA.split('.')[0]) <= 1:\n",
-    "            while True:\n",
-    "                try:\n",
-    "                    LA = namesFromXref(doi, title, 'last')\n",
-    "                except UnboundLocalError:\n",
-    "                    sleep(1)\n",
-    "                    continue\n",
-    "                break\n",
-    "\n",
-    "        if (yourFirstAuthor!='LastName, FirstName OptionalMiddleInitial') and (yourLastAuthor!='LastName, FirstName OptionalMiddleInitial'):\n",
-    "            selfCiteCheck1 = [s for s in author if removeMiddleName(yourLastAuthor) in str([convertLatexSpecialChars(str(s.rich_last_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\"), convertLatexSpecialChars(str(s.rich_first_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\")]).replace(\"'\", \"\")]\n",
-    "            selfCiteCheck1a = [s for s in author if removeMiddleName(yourLastAuthor) in str([convertSpecialCharsToUTF8(str(s.rich_last_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\"), convertSpecialCharsToUTF8(str(s.rich_first_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\")]).replace(\"'\", \"\")]\n",
-    "            selfCiteCheck1b = [s for s in author if removeMiddleName(yourLastAuthor) in str([convertSpecialCharsToUTF8(str(s.rich_last_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\"), LA]).replace(\"'\", \"\")]\n",
-    "\n",
-    "            selfCiteCheck2 = [s for s in author if removeMiddleName(yourFirstAuthor) in str([convertLatexSpecialChars(str(s.rich_last_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\"), convertLatexSpecialChars(str(s.rich_first_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\")]).replace(\"'\", \"\")]\n",
-    "            selfCiteCheck2a = [s for s in author if removeMiddleName(yourFirstAuthor) in str([convertSpecialCharsToUTF8(str(s.rich_last_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\"), convertSpecialCharsToUTF8(str(s.rich_first_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\")]).replace(\"'\", \"\")]\n",
-    "            selfCiteCheck2b = [s for s in author if removeMiddleName(yourFirstAuthor) in str([convertSpecialCharsToUTF8(str(s.rich_last_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\"), FA]).replace(\"'\", \"\")]\n",
-    "\n",
-    "            nameCount = 0\n",
-    "            if optionalEqualContributors != ('LastName, FirstName OptionalMiddleInitial', 'LastName, FirstName OptionalMiddleInitial'):\n",
-    "                for name in optionalEqualContributors:\n",
-    "                    selfCiteCheck3 = [s for s in author if removeMiddleName(name) in str([convertLatexSpecialChars(str(s.rich_last_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\"), convertLatexSpecialChars(str(s.rich_first_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\")]).replace(\"'\", \"\")]\n",
-    "                    selfCiteCheck3a = [s for s in author if removeMiddleName(name) in str([convertSpecialCharsToUTF8(str(s.rich_last_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\"), convertSpecialCharsToUTF8(str(s.rich_first_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\")]).replace(\"'\", \"\")]\n",
-    "                    if len(selfCiteCheck3)>0:\n",
-    "                        nameCount += 1\n",
-    "                    if len(selfCiteCheck3a)>0:\n",
-    "                        nameCount += 1\n",
-    "            selfCiteChecks = [selfCiteCheck1, selfCiteCheck1a, selfCiteCheck1b, selfCiteCheck2, selfCiteCheck2a, selfCiteCheck2b]\n",
-    "            if sum([len(check) for check in selfCiteChecks]) + nameCount > 0:\n",
-    "                selfCite = 'Y'\n",
-    "                if len(FA) < 2:\n",
-    "                    print(str(counter) + \": \" + key + \"\\t\\t  <-- self-citation <--  ***NAME MISSING OR POSSIBLY INCOMPLETE***\")\n",
-    "                else:\n",
-    "                    print(str(counter) + \": \" + key + \"  <-- self-citation\")\n",
-    "            else:\n",
-    "                selfCite= 'N'\n",
-    "                if len(FA) < 2:\n",
-    "                    print(str(counter) + \": \" + key + \"\\t\\t  <--  ***NAME MISSING OR POSSIBLY INCOMPLETE***\")\n",
-    "                else:\n",
-    "                    print(str(counter) + \": \" + key)\n",
-    "        else:\n",
-    "            selfCite = 'NA'\n",
-    "\n",
-    "        with open(outPath, 'a', newline='') as csvfile:\n",
-    "            writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)\n",
-    "            writer.writerow([counter, convertSpecialCharsToUTF8(FA), convertSpecialCharsToUTF8(LA), title, selfCite, key])\n",
-    "        counter += 1"
+    "    # find and print duplicates\n",
+    "    get_duplicates(bib_data)\n",
+    "    # get names, remove CDS, find self cites\n",
+    "    get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)"
    ]
   },
   {
@@ -1111,4 +723,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 1
-}
+}
\ No newline at end of file
diff --git a/tests/aux/checkcites.lua b/tests/aux/checkcites.lua
new file mode 100755
index 0000000..6191ca2
--- /dev/null
+++ b/tests/aux/checkcites.lua
@@ -0,0 +1,869 @@
+#!/usr/bin/env texlua
+-- -----------------------------------------------------------------
+-- checkcites.lua
+-- Copyright 2012, 2019, Enrico Gregorio, Paulo Roberto Massa Cereda
+--
+-- This work may be distributed and/or modified under the conditions
+-- of the LaTeX  Project Public License, either version  1.3 of this
+-- license or (at your option) any later version.
+--
+-- The latest version of this license is in
+--
+-- http://www.latex-project.org/lppl.txt
+--
+-- and version  1.3 or later is  part of all distributions  of LaTeX
+-- version 2005/12/01 or later.
+--
+-- This  work  has the  LPPL  maintenance  status `maintained'.  the
+-- current maintainers of  this work are the  original authors. This
+-- work consists of the file checkcites.lua.
+--
+-- Project repository: http://github.com/cereda/checkcites
+-- -----------------------------------------------------------------
+
+-- Checks if the table contains the element.
+-- @param a Table.
+-- @param hit Element.
+-- @return Boolean value if the table contains the element.
+local function exists(a, hit)
+  local found = false
+  for _, candidate in ipairs(a) do
+    found = (candidate == hit)
+    if found then break end
+  end
+  return found
+end
+
+-- Parses the list of arguments based on a configuration map.
+-- @param map Configuration map.
+-- @param args List of command line arguments.
+-- @return Table containing the valid keys and entries.
+-- @return Table containing the invalid keys.
+local function parse(map, args)
+  local keys, key, unknown = {}, 'unpaired', {} -- 'key' names the bucket that receives the next plain value
+  local a, b
+  for _, v in ipairs(args) do
+    a, _, b = string.find(v, '^%-(%w)$') -- short form: one dash and one alphanumeric character
+    if a then
+      for _, x in ipairs(map) do
+        key = 'unpaired'
+        if x['short'] == b then
+          key = x['long'] -- results are always stored under the long name
+          break
+        end
+      end
+      if key == 'unpaired' then
+        table.insert(unknown, '-' .. b) -- no map entry has this short name
+      end
+      if not keys[key] then
+        keys[key] = {}
+      end
+    else
+      a, _, b = string.find(v, '^%-%-([%w-]+)$') -- long form: two dashes and a name
+      if a then
+        for _, x in ipairs(map) do
+          key = 'unpaired'
+          if x['long'] == b then
+            key = b
+            break
+          end
+        end
+        if key == 'unpaired' then
+          if not exists(unknown, '--' .. b) then -- unknown long options are reported only once
+            table.insert(unknown, '--' .. b)
+          end
+        end
+        if not keys[key] then
+          keys[key] = {}
+        end
+      else
+        if not keys[key] then -- v is a plain value, not an option
+          keys[key] = {}
+        end
+        if key ~= 'unpaired' then
+          for _, x in ipairs(map) do
+            if x['long'] == key then
+              if not (x['argument'] and
+                 #keys[key] == 0) then -- option takes no value, or already has one
+                key = 'unpaired'
+              end
+              break
+            end
+          end
+          if not keys[key] then
+            keys[key] = {}
+          end
+          table.insert(keys[key], v)
+        else
+          if not keys[key] then
+            keys[key] = {}
+          end
+          table.insert(keys[key], v)
+        end
+      end
+    end
+  end
+  return keys, unknown -- keys: long name (or 'unpaired') -> list of values
+end
+
+-- Calculates the difference between two tables.
+-- @param a First table.
+-- @param b Second table.
+-- @return Table containing the difference between two tables.
+local function difference(a, b)
+  local missing = {}
+  for _, item in ipairs(a) do
+    if exists(b, item) == false then
+      missing[#missing + 1] = item
+    end
+  end
+  return missing
+end
+
+-- Splits the string based on a pattern.
+-- @param str String.
+-- @param pattern Pattern.
+local function split(str, pattern)
+  local pieces = {}
+  local function collect(piece) table.insert(pieces, piece) end
+  string.gsub(str, pattern, collect)
+  return pieces
+end
+
+-- Reads lines from a file.
+-- @param file File.
+-- @return Table representing the lines.
+local function read(file)
+  local contents = {}
+  local handler = io.open(file, 'r')
+  -- A file that cannot be opened yields an empty table.
+  if not handler then
+    return contents
+  end
+  for line in handler:lines() do table.insert(contents, line) end
+  handler:close()
+  return contents
+end
+
+-- Gets a pluralized word based on a counter.
+-- @param i Counter.
+-- @param a Word in singular.
+-- @param b Word in plural.
+-- @return Either the first or second word based on the counter.
+local function plural(i, a, b)
+  -- Only a counter of exactly one selects the singular word.
+  if i ~= 1 then
+    return b
+  end
+  return a
+end
+
+-- Normalizes the string, removing leading and trailing spaces.
+-- @param str String.
+-- @return Normalized string without leading and trailing spaces.
+local function normalize(str)
+  -- Use '%s+' so that whole runs of whitespace are trimmed, not just one.
+  local trimmed = string.gsub(str, '^%s+', '')
+  return (string.gsub(trimmed, '%s+$', '')) -- parentheses drop the count
+end
+
+-- Checks if the element is in a blacklist.
+-- @param a Element.
+-- @return Boolean value if the element is blacklisted.
+local function blacklist(a)
+  -- The list of banned names is empty here, so nothing can match.
+  local banned = {}
+  for _, entry in ipairs(banned) do
+    local hit = (entry == a)
+    if hit then return true end
+  end
+  return false
+end
+
+-- Checks if the key is allowed.
+-- @param key The key itself.
+-- @return Boolean value if the key is allowed.
+local function allowed(key)
+  -- Entry types that are rejected; the comparison ignores case
+  -- because the key is lowercased before the lookup.
+  local reserved = {
+    string = true,
+    comment = true
+  }
+  return not reserved[string.lower(key)]
+end
+
+-- Extracts the bibliographic keys.
+-- @param lines Lines of a file.
+-- @return Table containing bibliographic keys.
+local function extract(lines)
+  local result = {}
+  for _, line in ipairs(lines) do
+    local key, hit = string.match(line, '^%s*%@(%w+%s*){%s*(.+),')
+    if key and allowed(key) then
+      -- normalize first, so the duplicate check sees the stored form
+      hit = normalize(hit)
+      if not exists(result, hit) then
+        table.insert(result, hit)
+      end
+    end
+  end
+  return result
+end
+
+-- Extracts the cross-references found
+-- in lines of the bibliography file.
+-- @param lines Line of a file.
+-- @return Table containing cross-references.
+local function crossref(lines)
+  local result, lookup, key, hit = {}, '' -- lookup: key of the entry currently being read
+  for _, line in ipairs(lines) do
+     key, hit = string.match(line,
+                '^%s*%@(%w+%s*){%s*(.+),')
+    if key and allowed(key) then
+      lookup = normalize(hit) -- an entry header starts a new entry
+    else
+      key, hit = string.match(line,
+                 '^%s*(%w+)%s*=%s*(.+)$') -- a "field = value" line
+      if key then
+        key = string.lower(key)
+        if key == 'crossref' then
+          if string.sub(hit, -1) == ',' then
+            hit = string.sub(hit, 2, -3) -- drop the first char and the last two (closing delimiter + comma)
+          else
+            hit = string.sub(hit, 2, -2) -- drop the first and last char (the delimiters)
+          end
+          result[lookup] = hit -- entry key -> key it cross-references
+        end
+      end
+    end
+  end
+  return result
+end
+
+-- Adds the extension if the file does not have it.
+-- @param file File.
+-- @param extension Extension.
+-- @return File with proper extension.
+local function sanitize(file, extension)
+  local suffix = '.' .. extension
+  if string.sub(file, -#suffix) == suffix then
+    return file
+  end
+  return file .. suffix
+end
+
+-- Checks if a file exists.
+-- @param file File.
+-- @return Boolean value indicating if the file exists.
+local function valid(file)
+  -- Probe the file by opening it for reading.
+  local handler = io.open(file, 'r')
+  local found = handler ~= nil
+  if found then
+    handler:close()
+  end
+  return found
+end
+
+-- Wraps a string based on a line width.
+-- @param str String.
+-- @param size Line width.
+-- @return Wrapped string.
+local function wrap(str, size)
+  local words = split(str, '[^%s]+')
+  local text, current = '', ''
+  for _, word in ipairs(words) do
+    if (#current + #word) <= size then
+      current = normalize(current .. ' ' .. word)
+    else
+      text = text .. '\n' .. current
+      current = word
+    end
+  end
+  text = normalize(text .. '\n' .. current)
+  return text
+end
+
+-- Backend namespace
+local backends = {}
+
+-- Gets data from auxiliary files (BibTeX).
+-- @param lines Lines of a file.
+-- @param rec Recursive switch.
+-- @return Boolean indicating if an asterisk was found.
+-- @return Table containing the citations.
+-- @return Table containing the bibliography files.
+backends.bibtex = function(lines, rec)
+  local citations, bibliography, invalid = {}, {}, {} -- invalid: \@input files that could not be opened
+  local asterisk, parts, hit = false
+  for _, line in ipairs(lines) do
+    hit = string.match(line, '^%s*\\citation{(.+)}$')
+    if hit then
+      if hit ~= '*' then
+        parts = split(hit, '[^,%s]+') -- one \citation line may carry several comma-separated keys
+        for _, v in ipairs(parts) do
+          v = normalize(v)
+          if not exists(citations, v) then
+            table.insert(citations, v)
+          end
+        end
+      else
+        asterisk = true -- a lone '*' is recorded as a flag, not as a key
+      end
+    else
+      hit = string.match(line, '^%s*\\bibdata{(.+)}$')
+      if hit then
+        parts = split(hit, '[^,%s]+')
+        for _, v in ipairs(parts) do
+          v = normalize(v)
+          if not exists(bibliography, v) and
+             not blacklist(v) then
+            table.insert(bibliography, v)
+          end
+        end
+      else
+        hit = string.match(line, '^%s*\\@input{(.+)}$')
+        if rec and hit then -- included files are followed only when rec is set
+          hit = sanitize(hit, 'aux')
+          if not valid(hit) then
+            table.insert(invalid, hit)
+          else
+            local a, b, c = backends.bibtex(read(hit), false) -- rec = false: only one level of inclusion is followed
+            asterisk = asterisk or a
+            for _, v in ipairs(b) do
+              if not exists(citations, v) then
+                table.insert(citations, v)
+              end
+            end
+            for _, v in ipairs(c) do
+              if not exists(bibliography, v) then
+                table.insert(bibliography, v)
+              end
+            end
+          end
+        end
+      end
+    end
+  end
+  if #invalid ~= 0 then -- warn, but still return what was collected
+    print()
+    print(wrap('Warning: there ' .. plural(#invalid,
+               'is an invalid reference ', 'are ' ..
+               'invalid references ') .. 'to the ' ..
+               'following auxiliary ' .. plural(#invalid,
+               'file ', 'files ') .. 'that could not ' ..
+               'be resolved at runtime:', 74))
+    for _, v in ipairs(invalid) do
+      print('=> ' .. v)
+    end
+  end
+  return asterisk, citations, bibliography
+end
+
+-- Gets data from auxiliary files (Biber).
+-- @param lines Lines of a file.
+-- @param _ To be discarded with biber.
+-- @return Boolean indicating if an asterisk was found.
+-- @return Table containing the citations.
+-- @return Table containing the bibliography files.
+backends.biber = function(lines, _)
+  local citations, bibliography = {}, {}
+  local asterisk, parts, hit = false
+  for _, line in ipairs(lines) do
+    hit = string.match(line, '^%s*<bcf:citekey order="%d+">' ..
+          '(.+)</bcf:citekey>$') -- text content of a citekey element
+    if hit then
+      if hit ~= '*' then
+        parts = split(hit, '[^,%s]+') -- split on commas and whitespace
+        for _, v in ipairs(parts) do
+          v = normalize(v)
+          if not exists(citations, v) then
+            table.insert(citations, v)
+          end
+        end
+      else
+        asterisk = true -- a lone '*' is recorded as a flag, not as a key
+      end
+    else
+      hit = string.match(line, '^%s*<bcf:datasource type="file" ' ..
+            'datatype="%w+">(.+)</bcf:datasource>$') -- text content of a file datasource element
+      if hit then
+        parts = split(hit, '[^,%s]+')
+        for _, v in ipairs(parts) do
+          v = normalize(v)
+          if not exists(bibliography, v) and
+             not blacklist(v) then
+            table.insert(bibliography, v)
+          end
+        end
+      end
+    end
+  end
+  return asterisk, citations, bibliography
+end
+
+-- Counts the number of elements of a nominal table.
+-- @param t Table.
+-- @return Table size.
+local function count(t)
+  local total = 0
+  for _ in pairs(t) do
+    total = total + 1
+  end
+  return total
+end
+
+-- Repeats the provided char a certain number of times.
+-- @param c Char.
+-- @param size Number of times.
+-- @return String with a char repeated a certain number of times.
+local function pad(c, size)
+  local filled = c
+  while size > #filled do
+    filled = filled .. c
+  end
+  return filled
+end
+
+-- Flattens a table of tables into only one table.
+-- @param t Table.
+-- @return Flattened table.
+local function flatten(t)
+  -- Inner tables are walked in order; repeated values are kept once.
+  local merged = {}
+  for _, group in ipairs(t) do
+    for _, item in ipairs(group) do
+      local seen = exists(merged, item)
+      if not seen then merged[#merged + 1] = item end
+    end
+  end
+  return merged
+end
+
+-- Organizes a key/value table of tables into only one table.
+-- @param t Table.
+-- @return Flattened key/value table.
+local function organize(t)
+  -- Merges the inner key/value tables in order; once a key holds a
+  -- truthy value, later tables do not replace it.
+  local merged = {}
+  for _, mapping in ipairs(t) do
+    for name, value in pairs(mapping) do
+      merged[name] = merged[name] or value
+    end
+  end
+  return merged
+end
+
+-- Applies a function to elements of a table.
+-- @param c Table.
+-- @param f Function.
+-- @return A new table.
+local function apply(c, f)
+  -- Walks the sequence part of c in order and appends the
+  -- value of f for each element to a fresh table.
+  local mapped = {}
+  for _, item in ipairs(c) do table.insert(mapped, f(item)) end
+  return mapped
+end
+
+-- Search the TeX tree for the file.
+-- @param library The library reference.
+-- @param file The filename.
+-- @param extension The extension.
+-- @return String pointing to the file location.
+local function lookup(library, file, extension)
+  return library.find_file(file, extension) -- dot call: the library itself is not passed as an argument
+end
+
+-- Prints the script header.
+local function header()
+print("     _           _       _ _") -- ASCII-art logo, printed verbatim (not passed through wrap)
+print(" ___| |_ ___ ___| |_ ___|_| |_ ___ ___")
+print("|  _|   | -_|  _| '_|  _| |  _| -_|_ -|")
+print("|___|_|_|___|___|_,_|___|_|_| |___|___|")
+print()
+  print(wrap('checkcites.lua -- a reference ' ..
+             'checker script (v2.4)', 74)) -- title line, wrapped with a width of 74
+  print(wrap('Copyright (c) 2012, 2019, ' ..
+             'Enrico Gregorio, Paulo ' ..
+             'Roberto Massa Cereda', 74))
+end
+
+-- Operation namespace
+local operations = {}
+
+-- Reports the unused references.
+-- @param citations Citations.
+-- @param references References.
+-- @return Integer representing the status.
+operations.unused = function(citations, references, crossrefs)
+  print()
+  print(pad('-', 74))
+  print(wrap('Report of unused references in your TeX ' ..
+             'document (that is, references present in ' ..
+             'bibliography files, but not cited in ' ..
+             'the TeX source file)', 74))
+  print(pad('-', 74))
+
+  local z = {} -- entries cross-referenced by a cited entry
+  for _, citation in ipairs(citations) do
+    if crossrefs[citation] then
+      table.insert(z, crossrefs[citation])
+    end
+  end
+
+  for _, i in ipairs(z) do
+    if not exists(citations, i) then -- table first, element second; citations is extended in place
+      table.insert(citations, i)
+    end
+  end
+
+  local r = difference(references, citations) -- defined in the bibliography but never cited
+  print()
+  print(wrap('Unused references in your TeX document: ' ..
+             tostring(#r), 74))
+  if #r == 0 then
+    return 0
+  else
+    for _, v in ipairs(r) do
+      print('=> ' .. v)
+    end
+    return 1
+  end
+end
+
+-- Reports the undefined references.
+-- @param citations Citations.
+-- @param references References.
+-- @return Integer value indicating the status.
+operations.undefined = function(citations, references, crossrefs)
+  print()
+  print(pad('-', 74))
+  print(wrap('Report of undefined references in your TeX ' ..
+             'document (that is, references cited in the ' ..
+             'TeX source file, but not present in the ' ..
+             'bibliography files)', 74))
+  print(pad('-', 74))
+
+  local z = {} -- entries cross-referenced by a cited entry
+  for _, citation in ipairs(citations) do
+    if crossrefs[citation] then
+      table.insert(z, crossrefs[citation])
+    end
+  end
+
+  for _, i in ipairs(z) do
+    if not exists(citations, i) then -- table first, element second; avoids listing a key twice
+      table.insert(citations, i)
+    end
+  end
+
+  local r = difference(citations, references) -- cited but not defined in any bibliography
+  print()
+  print(wrap('Undefined references in your TeX document: ' ..
+        tostring(#r), 74))
+  if #r == 0 then
+    return 0
+  else
+    for _, v in ipairs(r) do
+      print('=> ' .. v)
+    end
+    return 1
+  end
+end
+
+-- Reports both unused and undefined references.
+-- @param citations Citations.
+-- @param references References.
+-- @return Integer value indicating the status.
+operations.all = function(citations, references, crossrefs)
+  -- Run both reports in order; any non-zero status fails the run.
+  local unused = operations.unused(citations, references, crossrefs)
+  local undefined = operations.undefined(citations, references,
+                                         crossrefs)
+  if unused + undefined <= 0 then
+    return 0
+  end
+  return 1
+end
+
+-- Filters a table of files, keeping the inexistent ones.
+-- @param files Table.
+-- @param lib Search library.
+-- @param enabled Boolean switch to enable lookup.
+-- @param extension Extension for lookup.
+-- @return Table of inexistent files.
+-- @return Table of existent files.
+local function validate(files, lib, enabled, extension)
+  local missing, found = {}, {}
+  for _, name in ipairs(files) do
+    if valid(name) then
+      -- the path can be opened as given
+      table.insert(found, name)
+    elseif enabled and lookup(lib, name, extension) then
+      -- otherwise keep whatever the library lookup returns
+      table.insert(found, lookup(lib, name, extension))
+    else
+      table.insert(missing, name)
+    end
+  end
+  return missing, found
+end
+
+-- Main function.
+-- @param args Command line arguments.
+-- @return Integer value indicating the status
+local function checkcites(args)
+
+  local kpse = require('kpse')
+  kpse.set_program_name('texlua')
+
+  header()
+
+  local parameters = { -- option map consumed by parse(); only 'backend' takes a value
+    { short = 'a', long = 'all', argument = false },
+    { short = 'u', long = 'unused', argument = false },
+    { short = 'U', long = 'undefined', argument = false },
+    { short = 'v', long = 'version', argument = false },
+    { short = 'h', long = 'help', argument = false },
+    { short = 'c', long = 'crossrefs', argument = false },
+    { short = 'b', long = 'backend', argument = true }
+  }
+
+  local keys, err = parse(parameters, args)
+  local check, backend = 'all', 'bibtex' -- defaults when no option overrides them
+
+  if #err ~= 0 then -- unknown options: report them and stop
+    print()
+    print(pad('-', 74))
+    print(wrap('I am sorry, but I do not recognize ' ..
+               'the following ' .. plural(#err, 'option',
+               'options') .. ':', 74))
+    for _, v in ipairs(err) do
+      print('=> ' .. v)
+    end
+
+    print()
+    print(wrap('Please make sure to use the correct ' ..
+               'options when running this script. You ' ..
+               'can also refer to the user documentation ' ..
+               'for a list of valid options. The script ' ..
+               'will end now.', 74))
+    return 1
+  end
+
+  if count(keys) == 0 then -- nothing at all on the command line
+    print()
+    print(pad('-', 74))
+    print(wrap('I am sorry, but you have not provided ' ..
+               'any command line argument, including ' ..
+               'files to check and options. Make ' ..
+               'sure to invoke the script with the actual ' ..
+               'arguments. Refer to the user documentation ' ..
+               'if you are unsure of how this tool ' ..
+               'works. The script will end now.', 74))
+    return 1
+  end
+
+  if keys['version'] or keys['help'] then -- informational modes return 0 without checking files
+    if keys['version'] then
+      print()
+      print(wrap('checkcites.lua, version 2.4 (dated September ' ..
+                 '3, 2019)', 74))
+
+      print(pad('-', 74))
+      print(wrap('You can find more details about this ' ..
+                 'script, as well as the user documentation, ' ..
+                 'in the official source code repository:', 74))
+
+      print()
+      print('https://github.com/cereda/checkcites')
+
+      print()
+      print(wrap('The checkcites.lua script is licensed ' ..
+                 'under the LaTeX Project Public License, ' ..
+                 'version 1.3. The current maintainers ' ..
+                 'are the original authors.', 74))
+    else
+      print()
+      print(wrap('Usage: ' .. args[0] .. ' [ [ --all | --unused | ' ..
+                 '--undefined ] [ --backend <arg> ] <file> [ ' ..
+                 '<file 2> ... <file n> ] | --help | --version ' ..
+                 ']', 74))
+
+      print()
+      print('-a,--all           list all unused and undefined references')
+      print('-u,--unused        list only unused references in your bibliography files')
+      print('-U,--undefined     list only undefined references in your TeX source file')
+      print('-c,--crossrefs     enable cross-reference checks (disabled by default)')
+      print('-b,--backend <arg> set the backend-based file lookup policy')
+      print('-h,--help          print the help message')
+      print('-v,--version       print the script version')
+
+      print()
+      print(wrap('Unless specified, the script lists all unused and ' ..
+                 'undefined references by default. Also, the default ' ..
+                 'backend is set to "bibtex". Please refer to the user ' ..
+                 'documentation for more details.', 74))
+    end
+    return 0
+  end
+
+  if not keys['unpaired'] then -- options were given, but no file
+    print()
+    print(pad('-', 74))
+    print(wrap('I am sorry, but you have not provided ' ..
+               'files to process. The tool requires ' ..
+               'at least one file in order to properly ' ..
+               'work. Make sure to invoke the script ' ..
+               'with an actual file (or files). Refer ' ..
+               'to the user documentation if you are ' ..
+               'unsure of how this tool works. The ' ..
+               'script will end now.', 74))
+    return 1
+  end
+
+  if keys['backend'] then
+    if not exists({ 'bibtex', 'biber' }, keys['backend'][1]) then
+      print()
+      print(pad('-', 74))
+      print(wrap('I am sorry, but you provided an ' ..
+                 'invalid backend. I know two: ' ..
+                 '"bibtex" (which is the default ' ..
+                 'one) and "biber". Please make ' ..
+                 'sure to select one of the two. ' ..
+                 'Also refer to the user documentation ' ..
+                 'for more information on how these ' ..
+                 'backends work. The script will end ' ..
+                 'now.', 74))
+      return 1
+    else
+      backend = keys['backend'][1]
+    end
+  end
+
+  if not keys['all'] then -- an explicit --all always wins
+    if keys['unused'] and keys['undefined'] then
+      check = 'all'
+    elseif keys['unused'] or keys['undefined'] then
+      check = (keys['unused'] and 'unused') or
+              (keys['undefined'] and 'undefined')
+    end
+  end
+
+  local auxiliary = apply(keys['unpaired'], function(a)
+                    return sanitize(a, (backend == 'bibtex'
+                    and 'aux') or 'bcf') end) -- extension depends on the backend
+
+  local invalid, _ = validate(auxiliary, kpse, false, 'aux') -- lookup disabled: paths must exist as given
+  if #invalid ~= 0 then
+    print()
+    print(pad('-', 74))
+    print(wrap('I am sorry, but I was unable to ' ..
+               'locate ' .. plural(#invalid, 'this file',
+               'these files')  .. ' (the extension ' ..
+               'is automatically set based on the ' ..
+               '"' .. backend .. '" backend):', 74))
+    for _, v in ipairs(invalid) do
+      print('=> ' .. v)
+    end
+
+    print()
+    print(wrap('Selected backend: ' .. backend, 74))
+    print(wrap('File lookup policy: add ".' ..
+               ((backend == 'bibtex' and 'aux') or 'bcf') ..
+               '" to files if not provided.', 74))
+
+    print()
+    print(wrap('Please make sure the ' .. plural(#invalid,
+               'path is', 'paths are') .. ' ' ..
+               'correct and the ' .. plural(#invalid,
+               'file exists', 'files exist') ..  '. ' ..
+               'There is nothing I can do at the moment. ' ..
+               'Refer to the user documentation for ' ..
+               'details on the file lookup. If ' .. plural(#invalid,
+               'this is not the file', 'these are not the ' ..
+               'files') .. ' you were expecting, ' ..
+               'double-check your source file or ' ..
+               'change the backend option when running ' ..
+               'this tool. The script will end now.', 74))
+    return 1
+  end
+
+  local lines = flatten(apply(auxiliary, read)) -- note: flatten also drops repeated lines
+  local asterisk, citations, bibliography = backends[backend](lines, true)
+
+  print()
+  print(wrap('Great, I found ' .. tostring(#citations) .. ' ' ..
+             plural(#citations, 'citation', 'citations') .. ' in ' ..
+             tostring(#auxiliary) .. ' ' .. plural(#auxiliary, 'file',
+             'files') ..'. I also found ' .. tostring(#bibliography) ..
+             ' ' .. 'bibliography ' .. plural(#bibliography, 'file',
+             'files') .. '. Let me check ' .. plural(#bibliography,
+             'this file', 'these files') .. ' and extract the ' ..
+             'references. Please wait a moment.', 74))
+
+  if asterisk then
+    print()
+    print(wrap('Also, it is worth noticing that I found a mention to ' ..
+               'a special "*" when retrieving citations. That means ' ..
+               'your TeX document contains "\\nocite{*}" somewhere in ' ..
+               'the source code. I will continue with the check ' ..
+               'nonetheless.', 74))
+  end
+
+  bibliography = apply(bibliography, function(a)
+                 return sanitize(a, 'bib') end)
+
+  invalid, bibliography = validate(bibliography, kpse, true, 'bib') -- lookup enabled for bibliography files
+  if #invalid ~= 0 then
+    print()
+    print(pad('-', 74))
+    print(wrap('I am sorry, but I was unable to locate ' ..
+               plural(#invalid, 'this file', 'these files') .. ' ' ..
+               '(the extension is automatically set to ' ..
+               '".bib", if not provided):', 74))
+    for _, v in ipairs(invalid) do
+      print('=> ' .. v)
+    end
+
+    print()
+    print(wrap('Please make sure the ' .. plural(#invalid,
+               'path is', 'paths are') .. ' ' ..
+               'correct and the ' .. plural(#invalid,
+               'file exists', 'files exist') ..  '. ' ..
+               'There is nothing I can do at the moment. ' ..
+               'Refer to the user documentation ' ..
+               'for details on bibliography lookup. If ' ..
+               plural(#invalid, 'this is not the file',
+               'these are not the files') .. ' you were ' ..
+               'expecting (wrong bibliography), double-check ' ..
+               'your source file. The script will end ' ..
+               'now.', 74))
+    return 1
+  end
+
+  local references = flatten(apply(bibliography, function(a)
+                     return extract(read(a)) end))
+
+  local crossrefs = (keys['crossrefs'] and organize(apply(bibliography,
+                    function(a) return crossref(read(a)) end))) or {} -- empty unless --crossrefs was given
+
+  print()
+  print(wrap('Fantastic, I found ' .. tostring(#references) ..
+             ' ' .. plural(#references, 'reference',
+             'references') .. ' in ' .. tostring(#bibliography) ..
+             ' bibliography ' .. plural(#bibliography, 'file',
+             'files') .. '. Please wait a moment while the ' ..
+             plural(((check == 'all' and 2) or 1), 'report is',
+             'reports are') .. ' generated.', 74))
+
+  return operations[check](citations, references, crossrefs) -- status of the selected report becomes the exit code
+end
+
+-- Call and exit
+os.exit(checkcites(arg))
+
+-- EOF
diff --git a/tests/immaculate/cleanedBib.csv b/tests/aux/cleanedBib.csv
similarity index 100%
rename from tests/immaculate/cleanedBib.csv
rename to tests/aux/cleanedBib.csv
diff --git a/tests/document.aux b/tests/aux/document.aux
similarity index 100%
rename from tests/document.aux
rename to tests/aux/document.aux
diff --git a/tests/pipeline.py b/tests/aux/pipeline.py
similarity index 57%
rename from tests/pipeline.py
rename to tests/aux/pipeline.py
index f0e1e70..999665b 100644
--- a/tests/pipeline.py
+++ b/tests/aux/pipeline.py
@@ -2,14 +2,14 @@
 from habanero import Crossref
 import sys
 import os
-wd = os.getcwd()
-print(f'{wd[0:-6]}/utils')
-print(f'{wd[0:-6]}/utils')
-sys.path.insert(1, f'{wd[0:-6]}/utils')
+from pathlib import Path
+wd = Path(os.getcwd())
+sys.path.insert(1, f'{wd.parent.parent.absolute()}/utils')
 from preprocessing import *
 
 cr = Crossref()
-homedir = '/home/jovyan/'
+#homedir = '/home/jovyan/'
+homedir = os.getcwd() + '/'
 bib_files = glob.glob(homedir + '*.bib')
 paper_aux_file = glob.glob(homedir + '*.aux')
 paper_bib_file = 'library_paper.bib'
@@ -18,9 +18,9 @@
 except:
     print('No optional .tex file found.')
 
-yourFirstAuthor = 'LastName, FirstName OptionalMiddleInitial'
-yourLastAuthor = 'LastName, FirstName OptionalMiddleInitial'
-optionalEqualContributors = ['LastName, FirstName OptionalMiddleInitial', 'LastName, FirstName OptionalMiddleInitial']
+yourFirstAuthor = 'Stiso, Jennifer '
+yourLastAuthor = 'Bassett, Dani '
+optionalEqualContributors = ['Zhou, Dale']
 checkingPublishedArticle = False
 
 ## end of user input
@@ -29,9 +29,9 @@
 
 bib_data = get_bib_data(homedir)
 if checkingPublishedArticle:
-    FA,LA = get_names_published(homedir, bib_data)
+    get_names_published(homedir, bib_data, cr)
 else:
     # find and print duplicates
     get_duplicates(bib_data)
     # get names, remove CDS, find self cites
-    FA,LA = get_names(bib_data)
\ No newline at end of file
+    get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)
\ No newline at end of file
diff --git a/tests/testBib_immaculate.bib b/tests/aux/testBib_immaculate.bib
similarity index 100%
rename from tests/testBib_immaculate.bib
rename to tests/aux/testBib_immaculate.bib
diff --git a/tests/cleanedBib.csv b/tests/cleanedBib.csv
new file mode 100644
index 0000000..1ca27bc
--- /dev/null
+++ b/tests/cleanedBib.csv
@@ -0,0 +1 @@
+Article,FA,LA,Title,SelfCite,CitationKey
diff --git a/tests/erroneous/pipeline.py b/tests/erroneous/pipeline.py
new file mode 100644
index 0000000..999665b
--- /dev/null
+++ b/tests/erroneous/pipeline.py
@@ -0,0 +1,37 @@
+import glob
+from habanero import Crossref
+import sys
+import os
+from pathlib import Path
+wd = Path(os.getcwd())
+sys.path.insert(1, f'{wd.parent.parent.absolute()}/utils')
+from preprocessing import *
+
+cr = Crossref()
+#homedir = '/home/jovyan/'
+homedir = os.getcwd() + '/'
+bib_files = glob.glob(homedir + '*.bib')
+paper_aux_file = glob.glob(homedir + '*.aux')
+paper_bib_file = 'library_paper.bib'
+try:
+    tex_file = glob.glob(homedir + "*.tex")[0]
+except:
+    print('No optional .tex file found.')
+
+yourFirstAuthor = 'Stiso, Jennifer '
+yourLastAuthor = 'Bassett, Dani '
+optionalEqualContributors = ['Zhou, Dale']
+checkingPublishedArticle = False
+
+## end of user input
+if paper_aux_file:
+    find_unused_cites(paper_aux_file)
+
+bib_data = get_bib_data(homedir)
+if checkingPublishedArticle:
+    get_names_published(homedir, bib_data, cr)
+else:
+    # find and print duplicates
+    get_duplicates(bib_data)
+    # get names, remove CDS, find self cites
+    get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)
\ No newline at end of file
diff --git a/tests/testBib_erroneous.bib b/tests/erroneous/testBib_erroneous.bib
similarity index 100%
rename from tests/testBib_erroneous.bib
rename to tests/erroneous/testBib_erroneous.bib
diff --git a/tests/immaculate/cleanedBib_test.csv b/tests/immaculate/cleanedBib_test.csv
new file mode 100644
index 0000000..fb49df2
--- /dev/null
+++ b/tests/immaculate/cleanedBib_test.csv
@@ -0,0 +1,15 @@
+Article,FA,LA,Title,SelfCite,CitationKey
+2,Gyorgy,Edvard,Memory navigation and theta rhythm in the hippocampal-entorhinal system,N,buzsaki2013memory
+3,Jamie,Dina,“I don't see gender”: Conceptualizing a gendered system of academic publishing,N,Lundine2019
+4,Perry,Danielle,Network architectures supporting learnability,N,zurn2020network
+5,William,William,Moralia Volume VI,N,moralia2005
+6,Danielle,Perry,Curious Minds,N,bassett2022curious
+7,Danielle,Jennifer,fake,Y,fake2022
+8,,JH,N-gram language models,N,jurafsky2018n
+9,Sara,Holly,Gendered citation patterns in international relations journals,N,mitchell2013gendered
+10,Paula,Rachel,Gender Disparity in Citations in High-Impact Journal Articles,N,chatterjee2021gender
+11,Jacqueline,Bradley,Gender (Im)balance in Citation Practices in Cognitive Neuroscience,N,fulvio2021imbalance
+12,Denzel,Ketanji,Test of ethnicolr,N,ethnicolr2022black
+13,Rafael,Alexandria,Test of ethnicolr,N,ethnicolr2022hispanic
+14,Andrew,Michelle,Test of ethnicolr,N,ethnicolr2022asian
+15,Nicolas,Meryl,Test of ethnicolr,N,ethnicolr2022white
diff --git a/utils/__pycache__/preprocessing.cpython-39.pyc b/utils/__pycache__/preprocessing.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba7ab28fc57decb39b0b40da6dd0f71ec7b9b663
GIT binary patch
literal 13102
zcmc&)U2GiJb)LVS{o!)Cr1&e#%EWOkuS!vtVx%eSsFp?1W+G8cDF0|<hZ*i2l1uH*
zEbq)x;(E4CLkCt+yHWa+xUGv)X&)+}Xn~;UYk{Iqd1+AuMKKR~DNs~>Dq5f@1SITt
z&de@%Ny$hGx823uxpU{<Gxwfz?z!JN_u7MlIR(GQpSM>3?UJJW6Fv0)bUeI>%l`oq
zp$N66l<BV3)VfyI)VNG9>-^3r8@TJWRNX9_$-8tp&F?bh4DLoPThEnq$-9B_0LoIe
z{B=c`BK?k19uygo#l0YMVgUCckr#uw7ezq~;XW*iVi@-kF(QuPeoTytG2BPRC&h%A
zd`B&hi79a$CF9}=aRT=V@uWD3`=t1UuyCIe>atRL>H)^1ESA)0=!zXUw--I(_~)-X
zw*wTJtJPJMA(wxtx{AUjTl$W~vwFIsQp4nz!rjCL&ME${APJOBRj5J>!G$ShUB|N~
zjQ6#+8mgQ6o*`2ARLbwAgo&IMn7c|_54Abvt(mqFq(fs@75Ys>&V~k25b2v5^7>u|
z&*02Av?=ADsw!KvA{Xl0irfw{nowU>L>8kOXw?JzrejrI%WmGb)~YqfTJxlZ!C1np
zH0zEVOk2_kn$opuRX?!2HLFq$?4ato)=g)}&sq2d8+KsXlIhHV7crsLW?;EqK*T$=
z$|fk+ns%#D*>Ea1LCNuF@GOZ*#7I97UNd-@Y6fd(J{zgIhw7OJ8s06oUODqI{Yu#=
z=Qmdy(yKVWAL+8`M#j>@(q&1KiA>>CAloSAH)_=&GHBG1UUS^Yv>Oe_6{S=;17@ud
zvm#SEezO*oi%qxbJ7UF_(%xB-_O0?rU&&h4wQG@j`@SN_@Rf(ZboNy!h=0}(s=l{*
zb}r$ye^xl#XDc<wcA-dTn?beapKVAd>1@?qpK0tw!w1Y;0k;~>AP<`UI1<G;uIAO8
zW~v$7^Xe$>h9)QQrb2w|{AijQxL6SdG9lCusxuQPff_1MDoyC5JiF>16cTEv;cn8s
zt-0q>n-uNMK>sOf8-ek4HPoPPnU<Yng_&*G(ym*sU3X?JKadz06foF;>QNcjA;qHK
zTFqXcwV)6d-qvXD#Ux^_oVA!+E8S*Mit9W`V^9x%yuZKSDx8|OmZTRrmB10to{E%K
z>QrgkIu+?4T1rKe70=yvWWe)!x#3i*cJ0yzW^kmVYf&2Puh|tRno8>RR=MuIdVT)0
zQSLRn)^sjQ>B*8F8Pu7`ppHkmgYL29<Ag*dpGI;Lmwy6@lFmR_#=fB&KQ>c6c@yHJ
zntgVJG_`9-KEy+)>?!vYwiGJvTA+pMt_n+`vshmd+MW)f>B5LDLh=lo5kG56+i0iU
zIn+-<urBJFuqJV<^r2cOQOiKBuZ8NH8n?@%gcRbvP+3+21GYz-P{Py%)Q81h?Gt;g
z>ONlVvmLnx8;<3Jk%@jxTVzjeRf7$f8^@|wg@CalPQyOcy=_OZT09`FeyCcDlGdpQ
zq~~m09<YT;$srU(dc$kTVagesUe%Q(g_22g-E|@za~N40(_VF>)LK;{XU5+<B*Lf|
zfBGtSW)Xe)3@G`NNE9=JsYm~3ah=4K!)2)Q9NzWGvTsdz4{tlN#MB}gYN5t5d=b)s
zRQqH|&%H9#+XiH)u?*L`GNf9)GNf83QHx}Fn`KBPd6bY0-(eZ*w6d`5uKW*V=ef0=
z!(w9b$e+Ui<ugcP0m)PJOtO*BA}JXxAeN0hO{FZFl>B+hPg62O$yrLCqlB#K!JLYQ
zW9^9r5|Bhbj|#DAOeAth9_R7CPaf16QX4jnWLN`bSJ{J6yhr>{Tc`W^K?Z3BS1{XW
zmiVEhNiw?8Ae*F>z~hP~wVX=|(Cp9%<dbOd7A~L7waL$Y{iAYHa_%2-@}PfhHNcD&
z&>NU*%t4{!o)QN4Jk@%-ujh~Hb6H=UpNmqqU#V6jv%2nj(vf6LOC}FSej52`py^g=
znG|+lM_JUV+qG&7>!IUPu`EynEi&jMkwLW%kJCbpHC#T8MKSZLv~cfRq%!mZ7mxIx
zkr9{;LR%-h2HSK~mrI17;yJMntrupfRH5td8*Q>d8Cb9^E*1D_DS6J(GcIF_aFh-l
z^<Fy_rkL7w+y|x<EOfw-;GBMs>$Xjl7amcbhNVl3Ay_(-EFIAcj3BkEZY%yja^9q;
zp67H=-VE?W&+<CW*k6Fv-a-yKR&4!MpQMCWoweq!EfDx>G*^L0HXH%G6;!df#99}x
zr$IHS!LpKl<SaGln$f=c9V6>IwY5t?VV4Mw#Vy@tebtfFlwNkxp3flh==ZH>tFCZv
zqeH7Z)X8|7^NfB1B8zmC0Xp&G&T~A{F`kkUrM8-m+yUfsYFL@EY**{RTP9#t;9%9Q
z1Dx6GPHQmm0=sr5##w%pLTBrKIkjHh1|(Bw*p{G`U9Z-5TH3T#%0ya4MyXq#yy?qN
zpbN1{A-fw5?RZT&PZs@R+#!of3siaq20tz=W#s3mIK@0H5BBk>oQ{X$%a^H+T8(Yd
z89W@)@Mw${Cx2dg^*5xmww!R<LrtG7nUXd%Q_HBw)uK9zdq$nazlguNZ(^m;_Kkf_
zEBr*$bE>?8`bVq)tsFcz$aHpq+pel65DwM=`U~FybLO79OTHIyf5~W_>(j1=-Eibe
z2hq-24V#u&j~oJmxGc}s$g7NR542&{YH6Rlun4r1Y5=|k3^Air#cR3&CVruxYI!I@
z!&P^M-yG)EFaWYxNh+~h@aq_pPqSZ1<-r@dgq$Re>N62!Ji5kk4i5o<5R8L50dtL|
z6F3%2+UbCoO_6#ZOK70ujqZ*I#$GCcOti!T?oc_oMx}J?yL~L$@am4J%2|st7IIKn
zPwp^l7^7O?8hL!PmfeA~fl~vVShsF%R4W_FYho+h1nXJhtl4lItg6o*Qc}Dc6Dh@0
zx-&)LPpyyvjRj$x#&bEHRJ}f5(#nO3SFd~SN{plZ@?>nZS6t8a1KSn06f40_!|`J;
z3pi)J=B-L%TRF#{WM8H{0LrwOk<_J*w~@IN`>{rpPFN6`aUaXt!d#T;bnUP-^XpEq
z(s}vYpybm!s-z1LB&m8qlds{qPob$hw9K$V3uM*_&MKQ&dhaRh?h>fQGdvPn;s7uS
zWX3XM7q|dbH$td7+}%QuBJc!=W5}-t@LPnrmku%vi+M799m@nHooRiU`O02!r^nM4
zxMpDtmT1BWtYC?c{oKTD`6zYhz}Hw>=~D}Hr)I5F57kp_`&a_43k$AACdcaf%G1^@
za=e@PEPXlwv<IKJ;D^hq<CAkf8^b5KvFW5VGRPBWs3aG`IXOZJ;amAhN(e|rdfjdS
zUcE~B+`+ga9c|-QhoDq}gm_T#kI~R4!mwLv0hS65rk01DD(p+bD;3h1&X4??0bEq(
z{{Sh3spB%Rf~Rnqgjw2YTE!=nb|z3a$<xT(Q|0pk2@yG1u~e8Lw2Nu0f9rfZ8<?Bv
zFiUYC<T4_IRX!Ky_Ok0(<-ey=i`NyAGhh#PbyabT=M=y7oTBpo22%d!0H=9Q2N7e*
zLACz9T?h-j@(+kS6ly3OLTS`4hJ$<gut=1{;@%)Ifl4@lar3`~`R=<ka){30n;-Vp
z+bf7cU@vWNNDP7MurT(DaWpC%o`V{5nHdgmD|c1-TsVv`fC_z~Jsb{z=5Wjfid-dJ
zINigAQACM&{NeEW(b12b1J(ZVho(@0GR&>-Z+$4n+9Tn>9O}Gv`<qH@Bp430I~bW5
z-!<C$>x%o7p#&p$)t$eO`4=1$m`<v411+{bxS_b(bG@x5(8}nx(snW*(W(%P(wBC%
z_OYH8vyX4VdeGs(pSH8%F)?;iyQ|5+>uLn^;$3A;6_aA>olGzm4(_VN<Mt?NNSF%7
zLw#383a=&{<eJC(Yl^Wcg=8y?b(Uo?5ln_aZ{mpw<&Hx54q5=+1qV~^B)=Uwcq>lC
zHTpD=WT5#!>D9q-geictTOR~dVICC5ZW?!W`Pae3E_5`^L-V%VV|`<O<&jbct`|l)
z_UImmV~^>N?*B$#f07a1)wibO9t6k5lfh)%w_Plw;v`W*00$o78llrqw63yoyS3rL
z0dG_r4!O-7Haq~QyvQd;v$K%RU!3NKR`R=YkssMGJ}hvY#1Su5S7$2zcBJ0o#s0n;
zrCsM1ec_=x{ZI|>>rqzVi@=JGEN8cx6xTtprktiM7To)0IejsP#*sFEG19JFET^s$
zFfM007_2-1%*!#%FE~4kts(&q0f@QgNcb;UD9Q6^83ef>AcC|n=kVOOT9FgR#*GZz
zaU`Mg)(C}PgtDzl;EvZI%q=+XV3^iQz5V6(TL?*zg#|d$W}b`G%Hc&b`@*{8I=370
z!atzef8bJD&yWpf7`f8r?aT``uVUBy3p3pYtx+C9GHf6Hj(KR8ROx_D>|nFgOhC17
zBs2<IdynDbNYpxwWfXxO;HNuEsKG%oLiO9)ma*IeaqsJL6D{NykVL9~UuDc3kBO{*
zlzV|9r+_An3;#(K>quIsK6*Ue79U3sB_VCOMG3EL)GH~G{1;`YI}bIW*s*LjYrSyh
z40}6gz(>}L*7?^jera)G@#R@-5l}eRC-%W^*}jEv>~<B-ZyYO!w?)w+%l;I+9*&aZ
zO*{c+tR=*9f%xjS+q7#4iujH#fd@IrzTx0aLffCQ7Ur#si*wes#Va=`&j<niArQ&b
zPGR{c*M%WM{KYtAN|AR8v{$?{iE~1}8aaO44mpJ9so$(6(*!2q;L-b-9w}JIRea<Z
z{Lb4)(d&+F#=5@YxYOu{i%vJ;gL?cCC^hv>IMymO_vTFL30Va9BhB+8gS-ty4Q@kM
ze2S)1Brh^N1ll8=?0|Vo(oByAzxe94>z7xqE-Wrwy?7<c#Kqn%`36aIh!O%PQ5N_V
zejBzY*!KfTg2jy(0}b|hWYO@e?kZufD>R=tc#Eii%|W!C+^s0<z!i2R&>x2;<W>3z
z#iI_ORcTx<QvsnWIZw$tB^yYhH2Xp@QaSb`FOg?d&L@S79=XM7MDLe~0ci}CVRs8H
z%R{}h8THD;y#?H~JOFL#%=aioKyas=t?wjG0KP*eEvdlKvvMxcw8dt<$EW394Pja)
zbr;B)j|}+a^?2Iw1mYQe>=BdqFVSEBH<2i&0W3&=Y&`%M<}`S<>NGYY=$`{(qzLE;
zuDo^vZ>S^(U$>x6qr^a7!&|DA*Lrh>q^!s-#!<)GH#M}^K8u<K`WjOiSD(jr2Y+p!
zd}mXachIW}dyv#W{K&-Ra9zUX{|yWuA<+bbrYVwy_-Yy$^eH%zI^}>FDZ+DrK{G*W
z(*!Cd44T?ZGX~9u*}cq1V9<@e^;X@??&Zj#xvRhRVmpU+1K(5I13;qL5V5IEAkZl#
z9Dvc?+X`Ew-Eu^vfD7|LOebQb)boZAV?M@YVO|V@%3ymCxgzHZK_M6-JS2w2NV26<
z42JfAM}-lLY#zG@hj?$4Feb3-DE3EjZx3>-!q3>M5F;;&R|C_2Kk3bQdx)tG{fyLx
zK#kfUe#g{?(5tbK`b6|KIg*U=<DgH}hT4VJ^Ig8OB)|_d;Y;TZ#Kn9XXFe4nkzy<n
zVYG`0GCq*ofS4dW-&bomkrliH8i&6}T-(ELG|)e!$Kz8mz8?XX1A~UcVNpy0VGkyF
zdFrmV^TPx$AMfIFZEG*4dso}~5w1P>G(=zb72?_v`ntT(L;pDF7rONIo#&6x*SCQF
z3HJjlUcjXUPlVVrAY2#@hK0}!bGsRM6^WO@Gv^8D7<q3m!N+^d%n8ll1ZgKyNH#2n
zBVjH^;JsLSD+HF-{lDm)OIxAH0bZSGeUowIV_bN`6uf>NTG@Chn6dl~uUUiFUGqKm
z_$k=O>!i=i?4`No_=!K!aqg}87`k?ltY!6LS@{*%7r8<S#jE63DOshYLJ7q-<QkI2
zQla&EESVP8M;LJao{)%ZO71VvAOh)DwZ3ZCY@kwF(_N=2Jw)aSr6Nj@;8n&Pe(OBZ
zN?O7E$9weac7qxsLgz@Ai|iU&V*de4q~Chui%lD=HC5h5N4Bwz8Mj0DHzBoJ1#$oh
zKpg40ULEm}S{w#OV1$TRYY1_6BZFA!vC#Rgi8nuGT|!`rHVU}jviJH17Nyo}Z-2fC
z5VIfvjDDFQgSCv$H)DR!H*wT}!N?K?y5pAC<p4o6?!?kYwdVO=V`FF5x@a*$Yl#Aw
z?beN#nPI?Q-N6G~(6B+e0=5CSdDznD{&#pt(H7sM9bc<cRCllsExekyz5{&Z)P&zU
z7n8Q;SUA;Shr8_1>4;cM2;3~f6Iu1Vo7VQZnde$JzGO31`Tz)m2|v9gnI=ojN1%~e
z%=@0PKz{b#oNN8!BX}ci(1x=^vJDsuVyp!8D4f`pTI)hGc2V^Kt!V71-{CMf-l(o`
zoWbZCAZTqOu7#rxU0B<CDQW*~wO;Dc<PHU3>7IU6ymc26f-3fEdh6^W5bBwjjnsn*
z(63)z>!=6oA3;WlvMDj7*2_y00b7g@*ctSJvb$oS_S}HR>^s<mpt(fDh8Kyh(Ci~^
z=dEQ)yNB`xBxMr>5%i5+sVWsuQ9{07Iomy65N}7QZ%1l7c8JJx?Aao4@mP9N22|Rl
zWSf#(l-#CdhY|vT@@+~&N*LkZK#n#?NFU2-D7=rEhy7;+ml3c9$c^1Z-Y#U28f61%
zyFP6+M%g+J`S2s1u%Xiy(Lkf=R)To4$GapHFDwr;!FaPfp~%?!SE(akqvY3+-~bKG
z<O*a6U)e%I=~xdAeub(nP_-$qLFx-|y#?>*68r*L#V)=l{TM9oQq6ZL`35CC?l+Oc
z>8#F7<1jJ<XrJY90I<ti-HqkLyEPf=(k1d0*xT#6d{MRwTMO{d{67JG0;|0swWwwa
zqpAkrISL3l4yc(^pP|DTbZOn3L6CJ60Z%UHcLf06N0#auLZk@=+%o{e1aXOePJQYC
z;sU<*J!#_%XmJ!o9p9G}JFjG@$H|W%EJ0d;SH<7Mc<vw;epB;{cMw{|PRAs+d^8Hz
zylWsB2H^TmCIr|-AS}Zr2pV-uvhPwYI@NPfmU~x6yL3;<0GDKXO7ib2DAPo?r)*H<
z5Jc6*05)a+mA6%wzNx*dfmXh!))1Ef_H_GD#32WQcsa!Be+a-*KzOZyvSL#9L8ol%
zgD?{UB8LM29fRv;dpOX<5GW~eFcr|DfM6Bq04lyIitUj=y+crH_$Gx{U&2WxBQ)_m
zg6-NgY95ONWC-OQelwc9d69yoeN^rosXLz39X<Sp!-XS&Cljr|J%A^%d(P{1yvab@
zUWfKs`*xn<2FcL^*k2r)qPW2P!4~F`cLy7peXkxn75i$<U+jCl(s@i`cs!DaCHe%J
z<Y|jLLRsrtGS%aw7Yk&@?OJO#iGRc)2M%daD8#S8Cl4MQ&|qxV7XzYr8_fgKf1m6z
zN2#8tEl!ehfP)VJ?z&KR`RLOoj+O3e>zD~R{(wEedwo%?k}fAfLcUH3LA3iOAIpiI
z`=?P54Y3`hAWLEe-=n&;n|xo3178iwt8C{E#h2P-W+=i@z=Ve>*~d;UPIYPft&<04
zvnPu8=&0AnI$}iYPS60lBbMKx#}6p^T}ry<k|Sihj~l@enb3|M%j9`vSSHgf6Kodr
z%H)p1vVe^`EDQO2Xm~$$i2w4t)G6M)r#|8w%?1_FqH?IKclx`k5wkVpC))mzu6}Il
z!(BdLAe7*hqy2iH`bE2*1v-z3kj4LLpX7%}d-UyoP93P_5l($PLY2QyeMrQ|VX;TB
z={x<MIuIY@EXe(LhzKP|dilR7&!>)Z?+^cf<@pCkd-O;BtUHh=Z>St$mjMdPxcpHh
zJX7;DQ)!pHC$?WwIk*YNQ*<BaIp|!_Aw<O{;?MAP4x&fu4L)iqeKcszA((zNn$9KG
z>C>kdFJ8TDU0qmSMr7T(_9biS8nQ1@ynFG|wW~{4E?>WlTCHP`pb@Eyt@9n~KRvBd
zD&7Mpbc^%6&H6zY7oe3^l6dyAX`w*o0l<_DPv1mYxzGWZd`{|Id8q%{K3uSRsKxn1
z70w@SaelR&rByGE68C@Z(}x;=+CD_xm%l|FCmW98b|cV|)tDz8k8YeQeH%%QG>04$
zWiW%oMw*t<M$D1aXllful(2C|r92(Hf%PKf3afg8u`YtM2(BJS$hM#!*G4s_2(_Vg
zv~*n3u77!~vBL)(;abgb<~*GPjE@g~1{LHNlUj7%WQGp6tYVX7rt0D-JD)s>9VBvw
z%A+`yfm-pglbPh)3m;yQ-=g=}_jBlI1Rp@+lhW~8a-P2PBqbIl6w!~*5WPk@I<de3
zVusi}h8`{Nu7*+Og}Nu2HRl3Y79ZOpj?iJPo>a?>WF|ACnPZtO{wFfoToEO2WWMs>
DWK+_P

literal 0
HcmV?d00001

diff --git a/utils/__pycache__/queries.cpython-39.pyc b/utils/__pycache__/queries.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8e5b128322dd86be564bcfd5e2eee6253afb9213
GIT binary patch
literal 838
zcmZuvO-~d-5bckd*<IL05F{=kIw8hnLu5`~3^7KCi5E37CWhNE)w?v?GqZO0f{>Y;
zSAKypfusM0U(g3Ho;Z8spjOWs<zOZCy54m6t2dQSd%Zb=vAH?fzZ($p-IJTakUYaQ
zT?8ONKao8G>@6Aa4A1|K4ptH74yO5uP;sDRrWb(KLD5CVk*z^c2M~^E9aa>=G1mtb
zIV2T>2wEcsZRk`~QkXjn>ZpofzE7Zw$;N@c{KI>mW48LF4hz7$Ugy&5z(U176S(z-
z!(yYi1h>z49ifL7gmwF$LWQ?sai(-mFVKNUbok#cuv~>z&?iS!fAc!U`i#SJpPVyT
zfjbxMoL@A)uuB`KvwFJy&WL#T^~*RdAWpTFnG&&;RtXc^GX5|X`Xt`br7>Cz<6)`e
zbZQT9QPNl!4y88s3I|DCpi(S<VL0MAO1fxv^osgz4y7CfE^HKCWP}nK3MiRN>!P7l
zR%qACg)!+!oc3&KQ}sv-Gga2OP+F0j(^g~b$;LI^aA8wl`FAnk+An(EgMR@o9LXb5
zIJ$LBW8d_+)MHdp<*{#~#YCl<aG{wfX$Kw0GOhj7&`St=otf=7rEy%sN7tHhqgVYd
z&kE~;BF)8DqVM6C>&FSsP@BY9Y0AT7x6G!wD6C0993?o;w0JJ}lc|-;B>wNEFk6!o
z*Yg_Yg)Z~=I9fj=^ALfsHjQYT1}wsAjV{tvWa~7*FAwMn@6w2`Gaci}R|DFtjg1>{
TANO2)J1=3X#8a>Q1C#s)E*jeE

literal 0
HcmV?d00001

diff --git a/utils/preprocessing.py b/utils/preprocessing.py
index 890a056..bd2ba0a 100644
--- a/utils/preprocessing.py
+++ b/utils/preprocessing.py
@@ -1,6 +1,13 @@
 import subprocess
 from pylatexenc.latex2text import LatexNodes2Text
 import unicodedata
+import glob
+from pybtex.database.input import bibtex
+import os
+import csv
+from bibtexparser.bparser import BibTexParser
+import string
+from queries import *
 
 def checkcites_output(aux_file):
     '''take in aux file for tex document, return list of citation keys
@@ -23,7 +30,7 @@ def checkcites_output(aux_file):
 def clean_name(name, flag):
     """
 
-    :param name:
+    :param name: string author name
             flag: utf or latex
     :return: clean_name
     """
@@ -42,11 +49,13 @@ def clean_name(name, flag):
     else:
         raise ValueError
 
+    return clean_name
+
 def removeMiddleName(line):
     """
 
-    :param line:
-    :return:
+    :param line: string author name
+    :return: the same string, but with the middle name removed
     """
     arr = line.split()
     last = arr.pop()
@@ -62,11 +71,11 @@ def removeMiddleName(line):
     return str(first + ' ' + middle)
 
 
-def returnFirstName(line):
+def returnMiddletName(line):
     """
 
-    :param line:
-    :return:
+    :param line: string author name
+    :return: only the middle name
     """
     arr = line.split()
     n = len(arr)
@@ -102,9 +111,9 @@ def convertSpecialCharsToUTF8(text):
 def namesFromXrefSelfCite(doi, title):
     """
 
-    :param doi:
-    :param title:
-    :return:
+    :param doi: DOI of published article
+    :param title: the title of the same article
+    :return: selfCiteCheck: the number of self citations in a published article (indexed by DOI
     """
     selfCiteCheck = 0
     # get cross ref data
@@ -131,22 +140,25 @@ def find_unused_cites(paper_aux_file):
     """
 
     :param paper_aux_file: path to auxfile
-    :return:
     """
     print(checkcites_output(paper_aux_file))
     unused_in_paper = checkcites_output(paper_aux_file)  # get citations in library not used in paper
     print("Unused citations: ", unused_in_paper.count('=>'))
 
-def get_bib_data(homedir):
+def get_bib_data(homedir, parser=""):
     """
 
     :param homedir: home directory
+           parser: a string telling which parser to use (default is not to use bparser)
     :return: bib_data
     """
     ID = glob.glob(homedir + '*bib')
-    with open(ID[0]) as bibtex_file:
-        bib_data = bibtexparser.bparser.BibTexParser(common_strings=True,
-                                                     ignore_nonstandard_types=False).parse_file(bibtex_file)
+    if parser == 'bparser':
+        bib_data = BibTexParser(common_strings=True, ignore_nonstandard_types=False).parse_file(bibtex_file)
+    else:
+        parser = bibtex.Parser()
+        bib_data = parser.parse_file(ID[0])
+
     return bib_data
 
 def get_duplicates(bib_data):
@@ -157,7 +169,7 @@ def get_duplicates(bib_data):
     """
 
     duplicates = []
-    for key in bib_data.entries_dict.keys():
+    for key in bib_data.entries.keys():
         count = str(bib_data.entries).count("'ID\': \'" + key + "\'")
         if count > 1:
             duplicates.append(key)
@@ -167,7 +179,7 @@ def get_duplicates(bib_data):
                          ' '.join(map(str, duplicates)))
 
 
-def get_names_published(outPath, bib_data):
+def get_names_published(homedir, bib_data, cr):
     """
     whole pipeline for published papers
     :return: FA,
@@ -176,7 +188,6 @@ def get_names_published(outPath, bib_data):
     FA = []
     LA = []
     counter = 1
-    selfCiteCount = 0
     titleCount = 1  #
     counterNoDOI = list()  # row index (titleCount) of entries with no DOI
     outPath = homedir + 'cleanedBib.csv'
@@ -212,13 +223,13 @@ def get_names_published(outPath, bib_data):
     articleNum = 0
     for doi in citedArticleDOI:
         try:
-            FA = namesFromXref(doi, '', 'first')
+            FA = namesFromXref(cr, doi, '', 'first')
         except UnboundLocalError:
             sleep(1)
             continue
 
         try:
-            LA = namesFromXref(doi, '', 'last')
+            LA = namesFromXref(cr, doi, '', 'last')
         except UnboundLocalError:
             sleep(1)
             continue
@@ -256,14 +267,13 @@ def get_names_published(outPath, bib_data):
     return FA, LA
 
 
-def get_names(bib_data):
+def get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr):
     """
     take bib_data, and get lists of first and last names. should also get self cites and CDS cites
     :return: FA
               LA
     """
     counter = 1
-    nameCount = 0
     outPath = homedir + 'cleanedBib.csv'
 
     if os.path.exists(outPath):
@@ -329,7 +339,7 @@ def get_names(bib_data):
         if FA == '' or len(FA.split('.')[0]) <= 1:
             while True:
                 try:
-                    FA = namesFromXref(doi, title, 'first')
+                    FA = namesFromXref(cr, doi, title, 'first')
                 except UnboundLocalError:
                     sleep(1)
                     continue
@@ -337,13 +347,13 @@ def get_names(bib_data):
         if LA == '' or len(LA.split('.')[0]) <= 1:
             while True:
                 try:
-                    LA = namesFromXref(doi, title, 'last')
+                    LA = namesFromXref(cr, doi, title, 'last')
                 except UnboundLocalError:
                     sleep(1)
                     continue
                 break
 
-        self_cites(author, yourFirstAuthor,yourLastAuthor, optionalEqualContributors)
+        selfCite = self_cites(author, yourFirstAuthor,yourLastAuthor, optionalEqualContributors, FA, LA, counter, key)
         counter += 1
         with open(outPath, 'a', newline='') as csvfile:
             writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
@@ -351,7 +361,7 @@ def get_names(bib_data):
                 [counter, convertSpecialCharsToUTF8(FA), convertSpecialCharsToUTF8(LA), title, selfCite, key])
 
 
-def self_cites(author, yourFirstAuthor, yourLastAuthor, optionalEqualContributors):
+def self_cites(author, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, FA, LA, counter, key):
     """
     take author list, and find self citations
 
@@ -359,8 +369,11 @@ def self_cites(author, yourFirstAuthor, yourLastAuthor, optionalEqualContributor
     :param yourFirstAuthor:
     :param yourLastAuthor:
     :param optionalEqualContributors:
+    :param FA:
+    :param LA:
     :return:
     """
+
     if (yourFirstAuthor == 'LastName, FirstName OptionalMiddleInitial') or (
             yourLastAuthor == 'LastName, FirstName OptionalMiddleInitial'):
         raise ValueError("Please enter your manuscript's first and last author names")
@@ -381,74 +394,33 @@ def self_cites(author, yourFirstAuthor, yourLastAuthor, optionalEqualContributor
                            [clean_name(s.rich_last_names, 'utf'),
                             LA]).replace("'","")]
     # I was in the process of cleaning all thisup when we stopped
-    selfCiteCheck2 = [s for s in author if removeMiddleName(yourFirstAuthor) in str([
-        convertLatexSpecialChars(
-            str(s.rich_last_names)[
-            7:-3]).replace(
-            "', Protected('",
-            "").replace(
-            "'), '", ""),
-        convertLatexSpecialChars(
-            str(s.rich_first_names)[
-            7:-3]).replace(
-            "', Protected('",
-            "").replace(
-            "'), '",
-            "")]).replace(
-        "'", "")]
-    selfCiteCheck2a = [s for s in author if removeMiddleName(yourFirstAuthor) in str([
-        convertSpecialCharsToUTF8(
-            str(s.rich_last_names)[
-            7:-3]).replace(
-            "', Protected('",
-            "").replace(
-            "'), '", ""),
-        convertSpecialCharsToUTF8(
-            str(s.rich_first_names)[
-            7:-3]).replace(
-            "', Protected('",
-            "").replace(
-            "'), '",
-            "")]).replace(
-        "'", "")]
-    selfCiteCheck2b = [s for s in author if removeMiddleName(yourFirstAuthor) in str([
-        convertSpecialCharsToUTF8(
-            str(s.rich_last_names)[
-            7:-3]).replace(
-            "', Protected('",
-            "").replace(
-            "'), '", ""),
-        FA]).replace("'",
-                     "")]
+    selfCiteCheck2 = [s for s in author if removeMiddleName(yourFirstAuthor) in
+                      str([clean_name(s.rich_last_names, 'utf'),
+                           clean_name(s.rich_first_names, 'utf')]
+                      ).replace("'", "")]
+    selfCiteCheck2a = [s for s in author if removeMiddleName(yourFirstAuthor) in
+                       str(
+                           [clean_name(s.rich_last_names, 'utf'),
+                            clean_name(s.rich_first_names, 'utf')]
+                       ).replace("'", "")]
+    selfCiteCheck2b = [s for s in author if removeMiddleName(yourFirstAuthor) in
+                       str(
+                            [clean_name(s.rich_last_names, 'utf'),
+                            FA]).replace("'","")]
 
     nameCount = 0
     if optionalEqualContributors != (
             'LastName, FirstName OptionalMiddleInitial', 'LastName, FirstName OptionalMiddleInitial'):
         for name in optionalEqualContributors:
-            selfCiteCheck3 = [s for s in author if removeMiddleName(name) in str([convertLatexSpecialChars(
-                str(s.rich_last_names)[7:-3]).replace("', Protected('", "").replace("'), '", ""),
-                                                                                  convertLatexSpecialChars(
-                                                                                      str(s.rich_first_names)[
-                                                                                      7:-3]).replace(
-                                                                                      "', Protected('",
-                                                                                      "").replace("'), '",
-                                                                                                  "")]).replace(
-                "'", "")]
-            selfCiteCheck3a = [s for s in author if removeMiddleName(name) in str([
-                convertSpecialCharsToUTF8(
-                    str(s.rich_last_names)[
-                    7:-3]).replace(
-                    "', Protected('",
-                    "").replace(
-                    "'), '", ""),
-                convertSpecialCharsToUTF8(
-                    str(s.rich_first_names)[
-                    7:-3]).replace(
-                    "', Protected('",
-                    "").replace(
-                    "'), '",
-                    "")]).replace("'",
-                                  "")]
+            selfCiteCheck3 = [s for s in author if removeMiddleName(name) in
+                              str( [clean_name(s.rich_last_names, 'utf'),
+                           clean_name(s.rich_first_names, 'utf')]
+                      ).replace("'", "")]
+            selfCiteCheck3a = [s for s in author if removeMiddleName(name) in
+                               str(
+                                   [clean_name(s.rich_last_names, 'utf'),
+                                    clean_name(s.rich_first_names, 'utf')]
+                               ).replace("'", "")]
             if len(selfCiteCheck3) > 0:
                 nameCount += 1
             if len(selfCiteCheck3a) > 0:
@@ -469,6 +441,8 @@ def self_cites(author, yourFirstAuthor, yourLastAuthor, optionalEqualContributor
         else:
             print(str(counter) + ": " + key)
 
+    return selfCite
+
 
 
From f4d2c9fdf68e4f7b3ea9345a37949e632dcd02d1 Mon Sep 17 00:00:00 2001
From: Jennifer Stiso <jeni.stiso@gmail.com>
Date: Tue, 17 May 2022 11:28:04 -0400
Subject: [PATCH 10/47] automatically removed duplicates

---
 tests/aux/cleanedBib.csv               |  28 +--
 tests/aux/pipeline.py                  |   4 +-
 tests/aux/testBib_immaculate_clean.bib | 237 +++++++++++++++++++++++++
 3 files changed, 253 insertions(+), 16 deletions(-)
 create mode 100644 tests/aux/testBib_immaculate_clean.bib

diff --git a/tests/aux/cleanedBib.csv b/tests/aux/cleanedBib.csv
index fb49df2..7f322e7 100644
--- a/tests/aux/cleanedBib.csv
+++ b/tests/aux/cleanedBib.csv
@@ -1,15 +1,15 @@
 Article,FA,LA,Title,SelfCite,CitationKey
-2,Gyorgy,Edvard,Memory navigation and theta rhythm in the hippocampal-entorhinal system,N,buzsaki2013memory
-3,Jamie,Dina,“I don't see gender”: Conceptualizing a gendered system of academic publishing,N,Lundine2019
-4,Perry,Danielle,Network architectures supporting learnability,N,zurn2020network
-5,William,William,Moralia Volume VI,N,moralia2005
-6,Danielle,Perry,Curious Minds,N,bassett2022curious
-7,Danielle,Jennifer,fake,Y,fake2022
-8,,JH,N-gram language models,N,jurafsky2018n
-9,Sara,Holly,Gendered citation patterns in international relations journals,N,mitchell2013gendered
-10,Paula,Rachel,Gender Disparity in Citations in High-Impact Journal Articles,N,chatterjee2021gender
-11,Jacqueline,Bradley,Gender (Im)balance in Citation Practices in Cognitive Neuroscience,N,fulvio2021imbalance
-12,Denzel,Ketanji,Test of ethnicolr,N,ethnicolr2022black
-13,Rafael,Alexandria,Test of ethnicolr,N,ethnicolr2022hispanic
-14,Andrew,Michelle,Test of ethnicolr,N,ethnicolr2022asian
-15,Nicolas,Meryl,Test of ethnicolr,N,ethnicolr2022white
+2,Danielle,Perry,,N,bassett2022curious
+3,Gyorgy,Edvard,,N,buzsaki2013memory
+4,Paula,Rachel,,N,chatterjee2021gender
+5,Andrew,Michelle,,N,ethnicolr2022asian
+6,Denzel,Ketanji,,N,ethnicolr2022black
+7,Rafael,Alexandria,,N,ethnicolr2022hispanic
+8,Nicolas,Meryl,,N,ethnicolr2022white
+9,Danielle,Jennifer,,Y,fake2022
+10,Jacqueline,Bradley,,N,fulvio2021imbalance
+11,,JH,,N,jurafsky2018n
+12,,Dina,,N,Lundine2019
+13,Sara,Holly,,N,mitchell2013gendered
+14,William,William,,N,moralia2005
+15,Perry,Danielle,,N,zurn2020network
diff --git a/tests/aux/pipeline.py b/tests/aux/pipeline.py
index 999665b..4bd4640 100644
--- a/tests/aux/pipeline.py
+++ b/tests/aux/pipeline.py
@@ -27,11 +27,11 @@
 if paper_aux_file:
     find_unused_cites(paper_aux_file)
 
-bib_data = get_bib_data(homedir)
+bib_data = get_bib_data(bib_files[0])
 if checkingPublishedArticle:
     get_names_published(homedir, bib_data, cr)
 else:
     # find and print duplicates
-    get_duplicates(bib_data)
+    bib_data = get_duplicates(bib_data, bib_files[0])
     # get names, remove CDS, find self cites
     get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)
\ No newline at end of file
diff --git a/tests/aux/testBib_immaculate_clean.bib b/tests/aux/testBib_immaculate_clean.bib
new file mode 100644
index 0000000..dc0ece8
--- /dev/null
+++ b/tests/aux/testBib_immaculate_clean.bib
@@ -0,0 +1,237 @@
+@inproceedings{ambekar2009name,
+ author = {Ambekar, Anurag and Ward, Charles and Mohammed, Jahangir and Male, Swapna and Skiena, Steven},
+ booktitle = {Proceedings of the 15th ACM SIGKDD international conference on Knowledge Discovery and Data Mining},
+ pages = {49--58},
+ title = {Name-ethnicity classification from open sources},
+ year = {2009}
+}
+
+@book{bassett2022curious,
+ author = {Danielle S. Bassett and Perry Zurn},
+ publisher = {MIT Press},
+ title = {Curious Minds},
+ year = {2022}
+}
+
+@article{bertolero2021racial,
+ author = {Bertolero, Maxwell A. and Dworkin, Jordan D. and David, Sophia U. and Lloreda, Claudia López and Srivastava, Pragya and Stiso, Jennifer and Zhou, Dale and Dzirasa, Kafui and Fair, Damien A. and  Kaczkurkin, Antonia N. and Marlin, Bianca Jones and Shohamy, Daphna and Uddin, Lucina Q. and Zurn, Perry and Bassett, Danielle S.},
+ journal = {bioRxiv},
+ title = {Racial and ethnic imbalance in neuroscience reference lists and intersections with gender},
+ xoi = {10.1101/2020.10.12.336230},
+ year = {2020}
+}
+
+@article{buzsaki2013memory,
+ author = {Buzs{\'a}ki, Gy{\"o}rgy and Moser, Edvard I},
+ journal = {Nature neuroscience},
+ number = {2},
+ pages = {130},
+ publisher = {Nature Publishing Group},
+ title = {Memory, navigation and theta rhythm in the hippocampal-entorhinal system},
+ volume = {16},
+ year = {2013}
+}
+
+@article{caplar2017quantitative,
+ author = {Caplar, Neven and Tacchella, Sandro and Birrer, Simon},
+ journal = {Nature Astronomy},
+ number = {6},
+ pages = {0141},
+ publisher = {Nature Publishing Group},
+ title = {Quantitative evaluation of gender bias in astronomical publications from citation counts},
+ volume = {1},
+ year = {2017}
+}
+
+@article{chatterjee2021gender,
+ author = {Chatterjee, Paula and Werner, Rachel M},
+ journal = {JAMA Netw Open},
+ number = {7},
+ pages = {e2114509},
+ title = {Gender Disparity in Citations in High-Impact Journal Articles},
+ volume = {4},
+ year = {2021}
+}
+
+@article{dion2018gendered,
+ author = {Dion, Michelle L and Sumner, Jane Lawrence and Mitchell, Sara McLaughlin},
+ journal = {Political Analysis},
+ number = {3},
+ pages = {312--327},
+ publisher = {Cambridge University Press},
+ title = {Gendered citation patterns across political science and social science methodology fields},
+ volume = {26},
+ year = {2018}
+}
+
+@article{Dworkin2020.01.03.894378,
+ abstract = {Like many scientific disciplines, neuroscience has increasingly attempted to confront pervasive gender imbalances within the field. While much of the conversation has centered around publishing and conference participation, recent research in other fields has called attention to the prevalence of gender bias in citation practices. Because of the downstream effects that citations can have on visibility and career advancement, understanding and eliminating gender bias in citation practices is vital for addressing inequity in a scientific community. In this study, we sought to determine whether there is evidence of gender bias in the citation practices of neuroscientists. Utilizing data from five top neuroscience journals, we indeed find that reference lists tend to include more papers with men as first and last author than would be expected if gender was not a factor in referencing. Importantly, we show that this overcitation of men and undercitation of women is driven largely by the citation practices of men, and is increasing with time despite greater diversity in the academy. We develop a co-authorship network to determine the degree to which homophily in researchers{\textquoteright} social networks explains gendered citation practices and we find that men tend to overcite other men even when their social networks are representative of the field. We discuss possible mechanisms and consider how individual researchers might incorporate these findings into their own referencing practices.},
+ author = {Dworkin, Jordan D. and Linn, Kristin A. and Teich, Erin G. and Zurn, Perry and Shinohara, Russell T. and Bassett, Danielle S.},
+ doi = {10.1101/2020.01.03.894378},
+ elocation-id = {2020.01.03.894378},
+ eprint = {https://www.biorxiv.org/content/early/2020/01/11/2020.01.03.894378.full.pdf},
+ journal = {bioRxiv},
+ publisher = {Cold Spring Harbor Laboratory},
+ title = {The extent and drivers of gender imbalance in neuroscience reference lists},
+ url = {https://www.biorxiv.org/content/early/2020/01/11/2020.01.03.894378},
+ year = {2020}
+}
+
+@article{ethnicolr2022asian,
+ author = {Wang, Andrew and Yeoh, Michelle},
+ journal = {Nature neuroscience},
+ number = {2},
+ pages = {130},
+ publisher = {Nature Publishing Group},
+ title = {Test of ethnicolr},
+ volume = {16},
+ year = {2013}
+}
+
+@article{ethnicolr2022black,
+ author = {Washington, Denzel and Brown-Jackson, Ketanji},
+ journal = {Nature neuroscience},
+ number = {2},
+ pages = {130},
+ publisher = {Nature Publishing Group},
+ title = {Test of ethnicolr},
+ volume = {16},
+ year = {2013}
+}
+
+@article{ethnicolr2022hispanic,
+ author = {Cruz, Rafael and Ocasio-Cortez, Alexandria},
+ journal = {Nature neuroscience},
+ number = {2},
+ pages = {130},
+ publisher = {Nature Publishing Group},
+ title = {Test of ethnicolr},
+ volume = {16},
+ year = {2013}
+}
+
+@article{ethnicolr2022white,
+ author = {Coppola, Nicolas and Streep, Meryl},
+ journal = {Nature neuroscience},
+ number = {2},
+ pages = {130},
+ publisher = {Nature Publishing Group},
+ title = {Test of ethnicolr},
+ volume = {16},
+ year = {2013}
+}
+
+@book{fake2022,
+ author = {Danielle S. Bassett and Dale Zhou and Jennifer Stiso},
+ publisher = {MIT Press},
+ title = {fake},
+ year = {2022}
+}
+
+@article{fulvio2021imbalance,
+ author = {Fulvio, Jacqueline M and Akinnola, Ileri and Postle, Bradley R},
+ journal = {J Cogn Neurosci},
+ number = {1},
+ pages = {3-7},
+ title = {Gender (Im)balance in Citation Practices in Cognitive Neuroscience},
+ volume = {33},
+ year = {2021}
+}
+
+@article{jurafsky2018n,
+ author = {Jurafsky, D and Martin, JH},
+ journal = {Speech and Language Processing: An Introduction to Natural Language Processing, Computational Linguistics, and Speech Recognition},
+ title = {N-gram language models},
+ year = {2018}
+}
+
+@article{Lundine2019,
+ abstract = {Academic experts share their ideas, as well as contribute to advancing health science by participating in publishing as an author, reviewer and editor. The academy shapes and is shaped by knowledge produced within it. As such, the production of scientific knowledge can be described as part of a socially constructed system. Like all socially constructed systems, scientific knowledge production is influenced by gender. This study investigated one layer of this system through an analysis of journal editors' understanding of if and how gender influences editorial practices in peer reviewed health science journals. The study involved two stages: 1) exploratory in-depth qualitative interviews with editors at health science journals; and 2) a nominal group technique (NGT) with experts working on gender in research, academia and the journal peer review process. Our findings indicate that some editors had not considered the impact of gender on their editorial work. Many described how they actively strive to be ‘gender blind,' as this was seen as a means to be objective. This view fails to recognize how broader social structures operate to produce systemic inequities. None of the editors or publishers in this study were collecting gender or other social indicators as part of the article submission process. These findings suggest that there is room for editors and publishers to play a more active role in addressing structural inequities in academic publishing to ensure a diversity of knowledge and ideas are reflected.},
+ author = {Lundine, J. and Bourgeault, Ivy Lynn and Glonti, Ketevan and Hutchinson, Eleanor and Balabanova, Dina},
+ doi = {10.1016/j.socscimed.2019.112388},
+ file = {:Users/stiso/Downloads/1-s2.0-S0277953619303739-main.pdf:pdf},
+ issn = {18735347},
+ journal = {Social Science and Medicine},
+ keywords = {Feminist science studies,Gender inequity,Health sciences,Journalology,Peer review,Publishing},
+ number = {January},
+ pages = {112388},
+ pmid = {31288167},
+ publisher = {Elsevier},
+ title = {{“I don't see gender”: Conceptualizing a gendered system of academic publishing}},
+ url = {https://doi.org/10.1016/j.socscimed.2019.112388},
+ volume = {235},
+ year = {2019}
+}
+
+@article{maliniak2013gender,
+ author = {Maliniak, Daniel and Powers, Ryan and Walter, Barbara F},
+ journal = {International Organization},
+ number = {4},
+ pages = {889--922},
+ publisher = {Cambridge University Press},
+ title = {The gender citation gap in international relations},
+ volume = {67},
+ year = {2013}
+}
+
+@article{mitchell2013gendered,
+ author = {Mitchell, Sara McLaughlin and Lange, Samantha and Brus, Holly},
+ journal = {International Studies Perspectives},
+ number = {4},
+ pages = {485--492},
+ publisher = {Blackwell Publishing Ltd Oxford, UK},
+ title = {Gendered citation patterns in international relations journals},
+ volume = {14},
+ year = {2013}
+}
+
+@book{moralia2005,
+ author = {Plutarch, Helmbold, William},
+ publisher = {Harvard University Press},
+ title = {Moralia, Volume VI},
+ year = {1939}
+}
+
+@article{sood2018predicting,
+ author = {Sood, Gaurav and Laohaprapanon, Suriyan},
+ journal = {arXiv preprint arXiv:1805.02109},
+ title = {Predicting race and ethnicity from the sequence of characters in a name},
+ year = {2018}
+}
+
+@article{wang2021gendered,
+ author = {Wang, Xinyi and Dworkin, Jordan D. and Zhou, Dale and Stiso, Jennifer and Falk, Emily B and Bassett, Danielle S. and Zurn, Perry and Lydon-Staley, David M.},
+ doi = {10.1080/23808985.2021.1960180},
+ journal = {Annals of the International Communication Association},
+ title = {Gendered citation practices in the field of communication},
+ year = {2021}
+}
+
+@software{zhou_dale_2020_3672110,
+ author = {Zhou, Dale and
+Cornblath, Eli J. and
+Stiso, Jennifer and
+Teich, Erin G. and
+Dworkin, Jordan D. and
+Blevins, Ann S. and
+Bassett, Danielle S.},
+ doi = {10.5281/zenodo.3672110},
+ month = {February},
+ publisher = {Zenodo},
+ title = {Gender Diversity Statement and Code Notebook v1.0},
+ url = {https://doi.org/10.5281/zenodo.3672110},
+ version = {v1.0},
+ year = {2020}
+}
+
+@article{zurn2020network,
+ author = {Zurn, Perry and Bassett, Danielle S},
+ journal = {Philosophical Transactions of the Royal Society B},
+ number = {1796},
+ pages = {20190323},
+ publisher = {The Royal Society},
+ title = {Network architectures supporting learnability},
+ volume = {375},
+ year = {2020}
+}
+

From 3a0c1eb16b879d11e66da04b62c1dfdeee750221 Mon Sep 17 00:00:00 2001
From: Jennifer Stiso <jeni.stiso@gmail.com>
Date: Tue, 17 May 2022 12:32:27 -0400
Subject: [PATCH 11/47] updating pipelines

---
 cleanBib.ipynb                                |   4 +-
 tests/erroneous/cleanedBib.csv                |  15 ++
 tests/erroneous/pipeline.py                   |   4 +-
 tests/erroneous/testBib_erroneous_clean.bib   | 237 ++++++++++++++++++
 tests/immaculate/cleanedBib.csv               |  15 ++
 tests/immaculate/pipeline.py                  |   4 +-
 .../__pycache__/preprocessing.cpython-39.pyc  | Bin 13102 -> 13808 bytes
 utils/preprocessing.py                        |  49 +++-
 8 files changed, 311 insertions(+), 17 deletions(-)
 create mode 100644 tests/erroneous/cleanedBib.csv
 create mode 100644 tests/erroneous/testBib_erroneous_clean.bib
 create mode 100644 tests/immaculate/cleanedBib.csv

diff --git a/cleanBib.ipynb b/cleanBib.ipynb
index a5f6cff..4b9593d 100644
--- a/cleanBib.ipynb
+++ b/cleanBib.ipynb
@@ -137,12 +137,12 @@
     "if paper_aux_file:\n",
     "    find_unused_cites(paper_aux_file)\n",
     "\n",
-    "bib_data = get_bib_data(homedir)\n",
+    "bib_data = get_bib_data(bib_files[0])\n",
     "if checkingPublishedArticle:\n",
     "    get_names_published(homedir, bib_data, cr)\n",
     "else:\n",
     "    # find and print duplicates\n",
-    "    get_duplicates(bib_data)\n",
+    "    bib_data = get_duplicates(bib_data, bib_files[0])\n",
     "    # get names, remove CDS, find self cites\n",
     "    get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)"
    ]
diff --git a/tests/erroneous/cleanedBib.csv b/tests/erroneous/cleanedBib.csv
new file mode 100644
index 0000000..7f322e7
--- /dev/null
+++ b/tests/erroneous/cleanedBib.csv
@@ -0,0 +1,15 @@
+Article,FA,LA,Title,SelfCite,CitationKey
+2,Danielle,Perry,,N,bassett2022curious
+3,Gyorgy,Edvard,,N,buzsaki2013memory
+4,Paula,Rachel,,N,chatterjee2021gender
+5,Andrew,Michelle,,N,ethnicolr2022asian
+6,Denzel,Ketanji,,N,ethnicolr2022black
+7,Rafael,Alexandria,,N,ethnicolr2022hispanic
+8,Nicolas,Meryl,,N,ethnicolr2022white
+9,Danielle,Jennifer,,Y,fake2022
+10,Jacqueline,Bradley,,N,fulvio2021imbalance
+11,,JH,,N,jurafsky2018n
+12,,Dina,,N,Lundine2019
+13,Sara,Holly,,N,mitchell2013gendered
+14,William,William,,N,moralia2005
+15,Perry,Danielle,,N,zurn2020network
diff --git a/tests/erroneous/pipeline.py b/tests/erroneous/pipeline.py
index 999665b..4bd4640 100644
--- a/tests/erroneous/pipeline.py
+++ b/tests/erroneous/pipeline.py
@@ -27,11 +27,11 @@
 if paper_aux_file:
     find_unused_cites(paper_aux_file)
 
-bib_data = get_bib_data(homedir)
+bib_data = get_bib_data(bib_files[0])
 if checkingPublishedArticle:
     get_names_published(homedir, bib_data, cr)
 else:
     # find and print duplicates
-    get_duplicates(bib_data)
+    bib_data = get_duplicates(bib_data, bib_files[0])
     # get names, remove CDS, find self cites
     get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)
\ No newline at end of file
diff --git a/tests/erroneous/testBib_erroneous_clean.bib b/tests/erroneous/testBib_erroneous_clean.bib
new file mode 100644
index 0000000..dc0ece8
--- /dev/null
+++ b/tests/erroneous/testBib_erroneous_clean.bib
@@ -0,0 +1,237 @@
+@inproceedings{ambekar2009name,
+ author = {Ambekar, Anurag and Ward, Charles and Mohammed, Jahangir and Male, Swapna and Skiena, Steven},
+ booktitle = {Proceedings of the 15th ACM SIGKDD international conference on Knowledge Discovery and Data Mining},
+ pages = {49--58},
+ title = {Name-ethnicity classification from open sources},
+ year = {2009}
+}
+
+@book{bassett2022curious,
+ author = {Danielle S. Bassett and Perry Zurn},
+ publisher = {MIT Press},
+ title = {Curious Minds},
+ year = {2022}
+}
+
+@article{bertolero2021racial,
+ author = {Bertolero, Maxwell A. and Dworkin, Jordan D. and David, Sophia U. and Lloreda, Claudia López and Srivastava, Pragya and Stiso, Jennifer and Zhou, Dale and Dzirasa, Kafui and Fair, Damien A. and  Kaczkurkin, Antonia N. and Marlin, Bianca Jones and Shohamy, Daphna and Uddin, Lucina Q. and Zurn, Perry and Bassett, Danielle S.},
+ journal = {bioRxiv},
+ title = {Racial and ethnic imbalance in neuroscience reference lists and intersections with gender},
+ xoi = {10.1101/2020.10.12.336230},
+ year = {2020}
+}
+
+@article{buzsaki2013memory,
+ author = {Buzs{\'a}ki, Gy{\"o}rgy and Moser, Edvard I},
+ journal = {Nature neuroscience},
+ number = {2},
+ pages = {130},
+ publisher = {Nature Publishing Group},
+ title = {Memory, navigation and theta rhythm in the hippocampal-entorhinal system},
+ volume = {16},
+ year = {2013}
+}
+
+@article{caplar2017quantitative,
+ author = {Caplar, Neven and Tacchella, Sandro and Birrer, Simon},
+ journal = {Nature Astronomy},
+ number = {6},
+ pages = {0141},
+ publisher = {Nature Publishing Group},
+ title = {Quantitative evaluation of gender bias in astronomical publications from citation counts},
+ volume = {1},
+ year = {2017}
+}
+
+@article{chatterjee2021gender,
+ author = {Chatterjee, Paula and Werner, Rachel M},
+ journal = {JAMA Netw Open},
+ number = {7},
+ pages = {e2114509},
+ title = {Gender Disparity in Citations in High-Impact Journal Articles},
+ volume = {4},
+ year = {2021}
+}
+
+@article{dion2018gendered,
+ author = {Dion, Michelle L and Sumner, Jane Lawrence and Mitchell, Sara McLaughlin},
+ journal = {Political Analysis},
+ number = {3},
+ pages = {312--327},
+ publisher = {Cambridge University Press},
+ title = {Gendered citation patterns across political science and social science methodology fields},
+ volume = {26},
+ year = {2018}
+}
+
+@article{Dworkin2020.01.03.894378,
+ abstract = {Like many scientific disciplines, neuroscience has increasingly attempted to confront pervasive gender imbalances within the field. While much of the conversation has centered around publishing and conference participation, recent research in other fields has called attention to the prevalence of gender bias in citation practices. Because of the downstream effects that citations can have on visibility and career advancement, understanding and eliminating gender bias in citation practices is vital for addressing inequity in a scientific community. In this study, we sought to determine whether there is evidence of gender bias in the citation practices of neuroscientists. Utilizing data from five top neuroscience journals, we indeed find that reference lists tend to include more papers with men as first and last author than would be expected if gender was not a factor in referencing. Importantly, we show that this overcitation of men and undercitation of women is driven largely by the citation practices of men, and is increasing with time despite greater diversity in the academy. We develop a co-authorship network to determine the degree to which homophily in researchers{\textquoteright} social networks explains gendered citation practices and we find that men tend to overcite other men even when their social networks are representative of the field. We discuss possible mechanisms and consider how individual researchers might incorporate these findings into their own referencing practices.},
+ author = {Dworkin, Jordan D. and Linn, Kristin A. and Teich, Erin G. and Zurn, Perry and Shinohara, Russell T. and Bassett, Danielle S.},
+ doi = {10.1101/2020.01.03.894378},
+ elocation-id = {2020.01.03.894378},
+ eprint = {https://www.biorxiv.org/content/early/2020/01/11/2020.01.03.894378.full.pdf},
+ journal = {bioRxiv},
+ publisher = {Cold Spring Harbor Laboratory},
+ title = {The extent and drivers of gender imbalance in neuroscience reference lists},
+ url = {https://www.biorxiv.org/content/early/2020/01/11/2020.01.03.894378},
+ year = {2020}
+}
+
+@article{ethnicolr2022asian,
+ author = {Wang, Andrew and Yeoh, Michelle},
+ journal = {Nature neuroscience},
+ number = {2},
+ pages = {130},
+ publisher = {Nature Publishing Group},
+ title = {Test of ethnicolr},
+ volume = {16},
+ year = {2013}
+}
+
+@article{ethnicolr2022black,
+ author = {Washington, Denzel and Brown-Jackson, Ketanji},
+ journal = {Nature neuroscience},
+ number = {2},
+ pages = {130},
+ publisher = {Nature Publishing Group},
+ title = {Test of ethnicolr},
+ volume = {16},
+ year = {2013}
+}
+
+@article{ethnicolr2022hispanic,
+ author = {Cruz, Rafael and Ocasio-Cortez, Alexandria},
+ journal = {Nature neuroscience},
+ number = {2},
+ pages = {130},
+ publisher = {Nature Publishing Group},
+ title = {Test of ethnicolr},
+ volume = {16},
+ year = {2013}
+}
+
+@article{ethnicolr2022white,
+ author = {Coppola, Nicolas and Streep, Meryl},
+ journal = {Nature neuroscience},
+ number = {2},
+ pages = {130},
+ publisher = {Nature Publishing Group},
+ title = {Test of ethnicolr},
+ volume = {16},
+ year = {2013}
+}
+
+@book{fake2022,
+ author = {Danielle S. Bassett and Dale Zhou and Jennifer Stiso},
+ publisher = {MIT Press},
+ title = {fake},
+ year = {2022}
+}
+
+@article{fulvio2021imbalance,
+ author = {Fulvio, Jacqueline M and Akinnola, Ileri and Postle, Bradley R},
+ journal = {J Cogn Neurosci},
+ number = {1},
+ pages = {3-7},
+ title = {Gender (Im)balance in Citation Practices in Cognitive Neuroscience},
+ volume = {33},
+ year = {2021}
+}
+
+@article{jurafsky2018n,
+ author = {Jurafsky, D and Martin, JH},
+ journal = {Speech and Language Processing: An Introduction to Natural Language Processing, Computational Linguistics, and Speech Recognition},
+ title = {N-gram language models},
+ year = {2018}
+}
+
+@article{Lundine2019,
+ abstract = {Academic experts share their ideas, as well as contribute to advancing health science by participating in publishing as an author, reviewer and editor. The academy shapes and is shaped by knowledge produced within it. As such, the production of scientific knowledge can be described as part of a socially constructed system. Like all socially constructed systems, scientific knowledge production is influenced by gender. This study investigated one layer of this system through an analysis of journal editors' understanding of if and how gender influences editorial practices in peer reviewed health science journals. The study involved two stages: 1) exploratory in-depth qualitative interviews with editors at health science journals; and 2) a nominal group technique (NGT) with experts working on gender in research, academia and the journal peer review process. Our findings indicate that some editors had not considered the impact of gender on their editorial work. Many described how they actively strive to be ‘gender blind,' as this was seen as a means to be objective. This view fails to recognize how broader social structures operate to produce systemic inequities. None of the editors or publishers in this study were collecting gender or other social indicators as part of the article submission process. These findings suggest that there is room for editors and publishers to play a more active role in addressing structural inequities in academic publishing to ensure a diversity of knowledge and ideas are reflected.},
+ author = {Lundine, J. and Bourgeault, Ivy Lynn and Glonti, Ketevan and Hutchinson, Eleanor and Balabanova, Dina},
+ doi = {10.1016/j.socscimed.2019.112388},
+ file = {:Users/stiso/Downloads/1-s2.0-S0277953619303739-main.pdf:pdf},
+ issn = {18735347},
+ journal = {Social Science and Medicine},
+ keywords = {Feminist science studies,Gender inequity,Health sciences,Journalology,Peer review,Publishing},
+ number = {January},
+ pages = {112388},
+ pmid = {31288167},
+ publisher = {Elsevier},
+ title = {{“I don't see gender”: Conceptualizing a gendered system of academic publishing}},
+ url = {https://doi.org/10.1016/j.socscimed.2019.112388},
+ volume = {235},
+ year = {2019}
+}
+
+@article{maliniak2013gender,
+ author = {Maliniak, Daniel and Powers, Ryan and Walter, Barbara F},
+ journal = {International Organization},
+ number = {4},
+ pages = {889--922},
+ publisher = {Cambridge University Press},
+ title = {The gender citation gap in international relations},
+ volume = {67},
+ year = {2013}
+}
+
+@article{mitchell2013gendered,
+ author = {Mitchell, Sara McLaughlin and Lange, Samantha and Brus, Holly},
+ journal = {International Studies Perspectives},
+ number = {4},
+ pages = {485--492},
+ publisher = {Blackwell Publishing Ltd Oxford, UK},
+ title = {Gendered citation patterns in international relations journals},
+ volume = {14},
+ year = {2013}
+}
+
+@book{moralia2005,
+ author = {Plutarch, Helmbold, William},
+ publisher = {Harvard University Press},
+ title = {Moralia, Volume VI},
+ year = {1939}
+}
+
+@article{sood2018predicting,
+ author = {Sood, Gaurav and Laohaprapanon, Suriyan},
+ journal = {arXiv preprint arXiv:1805.02109},
+ title = {Predicting race and ethnicity from the sequence of characters in a name},
+ year = {2018}
+}
+
+@article{wang2021gendered,
+ author = {Wang, Xinyi and Dworkin, Jordan D. and Zhou, Dale and Stiso, Jennifer and Falk, Emily B and Bassett, Danielle S. and Zurn, Perry and Lydon-Staley, David M.},
+ doi = {10.1080/23808985.2021.1960180},
+ journal = {Annals of the International Communication Association},
+ title = {Gendered citation practices in the field of communication},
+ year = {2021}
+}
+
+@software{zhou_dale_2020_3672110,
+ author = {Zhou, Dale and
+Cornblath, Eli J. and
+Stiso, Jennifer and
+Teich, Erin G. and
+Dworkin, Jordan D. and
+Blevins, Ann S. and
+Bassett, Danielle S.},
+ doi = {10.5281/zenodo.3672110},
+ month = {February},
+ publisher = {Zenodo},
+ title = {Gender Diversity Statement and Code Notebook v1.0},
+ url = {https://doi.org/10.5281/zenodo.3672110},
+ version = {v1.0},
+ year = {2020}
+}
+
+@article{zurn2020network,
+ author = {Zurn, Perry and Bassett, Danielle S},
+ journal = {Philosophical Transactions of the Royal Society B},
+ number = {1796},
+ pages = {20190323},
+ publisher = {The Royal Society},
+ title = {Network architectures supporting learnability},
+ volume = {375},
+ year = {2020}
+}
+
diff --git a/tests/immaculate/cleanedBib.csv b/tests/immaculate/cleanedBib.csv
new file mode 100644
index 0000000..7f322e7
--- /dev/null
+++ b/tests/immaculate/cleanedBib.csv
@@ -0,0 +1,15 @@
+Article,FA,LA,Title,SelfCite,CitationKey
+2,Danielle,Perry,,N,bassett2022curious
+3,Gyorgy,Edvard,,N,buzsaki2013memory
+4,Paula,Rachel,,N,chatterjee2021gender
+5,Andrew,Michelle,,N,ethnicolr2022asian
+6,Denzel,Ketanji,,N,ethnicolr2022black
+7,Rafael,Alexandria,,N,ethnicolr2022hispanic
+8,Nicolas,Meryl,,N,ethnicolr2022white
+9,Danielle,Jennifer,,Y,fake2022
+10,Jacqueline,Bradley,,N,fulvio2021imbalance
+11,,JH,,N,jurafsky2018n
+12,,Dina,,N,Lundine2019
+13,Sara,Holly,,N,mitchell2013gendered
+14,William,William,,N,moralia2005
+15,Perry,Danielle,,N,zurn2020network
diff --git a/tests/immaculate/pipeline.py b/tests/immaculate/pipeline.py
index 999665b..4bd4640 100644
--- a/tests/immaculate/pipeline.py
+++ b/tests/immaculate/pipeline.py
@@ -27,11 +27,11 @@
 if paper_aux_file:
     find_unused_cites(paper_aux_file)
 
-bib_data = get_bib_data(homedir)
+bib_data = get_bib_data(bib_files[0])
 if checkingPublishedArticle:
     get_names_published(homedir, bib_data, cr)
 else:
     # find and print duplicates
-    get_duplicates(bib_data)
+    bib_data = get_duplicates(bib_data, bib_files[0])
     # get names, remove CDS, find self cites
     get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)
\ No newline at end of file
diff --git a/utils/__pycache__/preprocessing.cpython-39.pyc b/utils/__pycache__/preprocessing.cpython-39.pyc
index ba7ab28fc57decb39b0b40da6dd0f71ec7b9b663..b95c194989df590caedee869c0a591ccecbeda2f 100644
GIT binary patch
delta 2660
zcmZuzU2Ggz6~1R?_Q&hp_1ccTYkO@^LhH>Yj^miWU{b3Rp@~&m8d{Qg)3B`fj_t{g
zch`4j>)MQKnhlC0T55CEDx|cmN%;u^q-uqP0Of%Pq$2V2zyr(!LOirWyi^KL2w~2d
zb(5H8cIUfi&i}ploICT=x%W%)LM#?xc*)<@XM?A=;%V@<4(Havisqu6@qiV3hT#ZS
zJlDa4JamQS5<JY?aO~s}-i~7zkMbCf-8{}aaNNfeyc5SB-o?9d?B)A-4~|K`pC@_h
z3gl9}kM|>|j}P!Pj{SU)58*h#hq;bp`t5(+R3P5GuKpZAZ=MPK0pNIZe`pKfbFvw3
zLSD|dy$1`;&qwYku+;oaECYit<1FkT%_R_pFo>s+@<OxKg0KxCf2sDL_ylrX;80S8
z?l7)!m1{364R92f)nOaflyaA&HneTF1}-$fWyEupr&%^2--~}MAkLr%`AWywWHul)
zq;nD9eSXfg3{R_6jj~WFZ15>RYF72aBq|pc4KKKxNi>mJe+`vvf8E1jNU33nDB>LQ
ziVDI9R~QMp$)o)-U~HaYHtZ5>z&1O{E?&RJDDz8d9U2;k4VJ_?B`IgsRk>Qy#AR1$
z1U8TZ%n3F^SJ@nJjR&r*hso@uKWQH9V{4kL;bMfornG2u-n$kcorbUJ2Ksy_OP3xT
z&<_@O+IYAQmtiUHL|ml~JXq2;L5w)<yp2a1Q74AhN?qB8b<DIqiQ$3vBgCT=n8%WA
zE$GDEAdlCTSJG=Cr=yW@nH$<xfSqGJq%n=vRbb_=V{8-b#5e>0H-qSOqFqN_S!3l0
z($Jz^hI^M-U5@B@Ifas;&ziG&F8`i5wca8NPC;KborOxx(Rr<EnZ<%**uMU1C^LOB
zGpT3(4H<C|H=&4oaQC8}q7vxF+fE_OGnZXVP~~ve$2M3=*;1A?j1M_w`lRSVm8{|^
zW8-2UlDC53sU>4Y4yStaKSRbHyxHo3dBe%y%VX?>RVfxMdkP8Dc8Zn7>Xe`?o?Jcn
z#d2ZRGIXb++qGiRu<f~;Wv%GKSgbr}+&7oiv+}XdN7i*`!O-U_mQ`6c%k#QXc7$oz
zlM!8vP~iE9K0SvzrmfSzT&xIT6rEAMY8eIF&<$=n`bwpykKqdHRMZrC-o#L~vbj_<
zbd#e`bIvq`jd6rtu&fV6!kAk(gi*#UC;#!Eot>T4NWO@E%jC?1Wify$WV^iRZdQ42
z79CH+<*~&OioDRSKs<?0qKjBcx#|U$7tpO2Y|UBhCt)YS69k7yO<gQh#TOV`XI>P8
zImYv?HuN-JTdev`mko=vr2h=*NA4}YAIwut{=9e`MY&MfSk9x5Tzkur-}U~!iX7SZ
zl6`#d2L1qf_87uA3uzFC6rM~Gbx5HTYM+{hh!O|n?<k7$-(VmCQ5aRDN(4lynd~~F
z1w<ZgWvTZs0Wm|=>EvMWeI&$1SxTM)A)Cn=n31tmx?Mbr`~(&U%08W18d@bamw-yo
z&(2rr@>QC@O-YCdgG9JI7^6P}vr_iI(C>v7DvP+^qO~lKk%~i5lbL}}!DU$(*c$d7
z`!nvQ8Pf@8=c~0@EYk&p|50|PUx%&cPtwN~NXmDIK8rXyeDsqkp4!IV?8Dy2a~L)i
z@z>QZr_T+zoo<5n)wQ+QIaWTPF{gi1S@~0|znua3&Ef3R|8J{`srzjE(Kde9R$ajZ
zlYY(s2RdhF*rvLKSXRb~nxZEsE#vyB(4ZTugEJ_f)!%u@qIuGq`z0wBiK-A(36==#
z=JfvCDm*8{4<DagK~8o^yg-6=g0B+XT~+ZSJzgfL6I>yfB`C<%hgT-POk@YaRe}wI
zYY1M%HmtdP(ZsVz=*Y-+i?5ULtkg%wCSM|w-gn{~1it?ZNa3p&8&cS&vqFWMcTC4Z
z2Pa5rcLz5{J`M-v?U4*9^4*bZ@J6$mxv0W<`TIkMM_(nyVT4<XUvA<nM1z<n{@yCO
zE`N0-B_BC_`QUjHQUi<c5Ns2Cmte2Kn`lsa^d<^!1;jCuQ)i2Z2pR+w*SF3pqDJ%s
zqTZBe#!~V}qZiuA(QZJQIMNfK;{TyMbfof;`y{0EJ%YWd{FqWX<m<gfQc4+v`YUw$
z!|HEiP`&L<5)jC#BYVUDZLAgk7Y_>m{SW)QEnhnNC2ZP%9_>$3+VA)ciG&9;f<6kw
znoN)Xy61HwTiZsIzv&m`o$<Ghew&_YPWTyePvHP@{rvrvqvjMh5U$GK9D5>0R2KX%
znCzg`Cq{yvHg8pC<>M2BLBD=ql;w%xp?Na9j*tuCsZQ^RT%e3c<%%dGQ{I@!_RyD#
hC=qNCyg_h7{&`{q#^ugLuk8G2Z>Tez4WAF^{|7(sjL851

delta 2003
zcmZvdO>7fK6oB{bu76_UbrS5vc5H_vEzSl=C`~9tkP9i4N=-qb5DXzCo=xKX*%>cC
zah-<NR02^{%upzy6^Y8By}-e0g;c2rdTA@vs%nMWJ@i_&mm(oms2BR)8W2$!YrpsA
z&3kX&%<PWuPuv{QhP+;nge`L`H++9bQH#J2#dV2!$7JO`i8-0;w3P5NH>*IYF%MgX
z(#I;97iAUGm=9$&t76qCYgi3ijdC^fvj9pzYh-mScp4G`7Ghzv)UtXOL0QM5EQT`3
z8rT|?AqIVt-t-q0(yN2zEBQglP+9t1xeX924LN=S*j#$aRRnlleCsa3fH+a{CybN^
zJr86^m-@VMm_LWnStsRiRDv7$9@Ka}A}N#caZ^#!vjd-O55kldrM$$TPFjLFIS)*p
zka?>KOkpZ>oR=46Q^65sy4F+|9ZS+2<Y7_H%gh;~2qmvz?XKJv@sD=Jms4AZQ^N_@
zaAt^S4KB|4+TP@Vv*=Ds9i2=MbDBHhv4;moQj<nv6$Y5bNxQ=p9JQQ#>|kBBTx01e
zgQd9TEayweyL-d~Uwr-&`eiE-l1sz2`eiq0GT)E=B+d4u(PS=s?8P!vMyY&hTv~)B
zsYe=Uo0CnM`UO~2806(TiQ!XT7E>lvuHj<Uyu1V;&1j}04+UWAM5D@&noj0m&PA8$
z#uQLMmxBBrFc<EIJ9kKIZ;pr?)m>4~8f-aNe0NX0Yfbz=i1QY#!`_ozxA?7Q`(PJe
zi<Vx!!g3kYCQlhzOPw%gvphzQy@VP<HKCEvim;Tap^UEbw@CM}W0}cRa>&G31!K!m
z0rCxq8GkT(1Jx{Lkz6Wh5X3#Jlq&)CM|_}k$G=xqEI1*8q2J?{+_uqzBriu(iduwK
zv2E0tHcn=E_dV=BkVGiF3k)$7J_zID>u{vb@@$(-WzA%ID%1Thg-s!HaS;p0grmMU
zmL}T_!Ol!cIY&GsWvVn&FF>=L+?YxqH!L-sF{XtUjYGfK9DS#j_mLN^mG2~s62`=j
z(XEgZ?pP6K#6m0zUy0kXYjCEtuVIS}72<s3tH@s(+h4)&a0!?%ga^%F0xrn>k6euz
z%scYc`6?DFsB_)}(sYw5nV}2v?5`lv&kT#LP5O#hg?B#_8^&1eX{<8a@ig{oQ$5s+
zFPnC`B6)8C%!p8$zklHfRX#|dPm>=d3=@(BdPn(4>3;J!3Z%rX_?E6|wCGW8(!epo
zal#40Ny024M|hi%C-f5zipG}NDj%uygj0lf#jTb)XL&Ql-IlgMfvl$q?-T4qhlNjX
za6aDR4!sjJF|Nk}#6|rqTq;Fc2NXCUavRpI|A1l|5MRiAob(Loz;~0r{Fvv&&l?t6
z4$y$T@XMrqOt`wzxq!|=acW~9EQm*Kfz~VJLi5-Wy3EH&f6gptM0b1k#RVFDIsw-6
z8X1<?b6(tQPd6|3<e#kcD?TZu)%Lwc9#l3KVeim$QeVa*g4N|~Pv##{1VQ}WerRQB
zO*Fi`wCm3+?bDThpV_7D*|c*L?Y-qljv2{`a<1hGZ{nS3e~uO$<n(vPv*No=cLEnl
zEuS-K_PcOcTz~mSyP!TL(H+k?*tOaVwzs~PoQjA^!8_jYlBR=wx_VgL?O5yJd(k8w
zc0`@_>pLszI~%+oqAlT?IcCsJC;B_}z<#vw0m2yq-R-<6E_bfoxE@u!sqnU1M%i8C
W4!ZsB)$R)XMcfshDzx;wkNgkYHNSNL

diff --git a/utils/preprocessing.py b/utils/preprocessing.py
index bd2ba0a..b995478 100644
--- a/utils/preprocessing.py
+++ b/utils/preprocessing.py
@@ -6,8 +6,10 @@
 import os
 import csv
 from bibtexparser.bparser import BibTexParser
+import bibtexparser
 import string
 from queries import *
+import numpy as np
 
 def checkcites_output(aux_file):
     '''take in aux file for tex document, return list of citation keys
@@ -145,38 +147,62 @@ def find_unused_cites(paper_aux_file):
     unused_in_paper = checkcites_output(paper_aux_file)  # get citations in library not used in paper
     print("Unused citations: ", unused_in_paper.count('=>'))
 
-def get_bib_data(homedir, parser=""):
+def get_bib_data(filename, parser="bparser"):
     """
 
     :param homedir: home directory
            parser: a string telling which parser to use (default is not to use bparser)
     :return: bib_data
     """
-    ID = glob.glob(homedir + '*bib')
+    # NOTE(review): the first argument is now the path of a single .bib file and the default parser is 'bparser'
     if parser == 'bparser':
-        bib_data = BibTexParser(common_strings=True, ignore_nonstandard_types=False).parse_file(bibtex_file)
+        bib_data = BibTexParser(common_strings=True, ignore_nonstandard_types=False).parse_file(open(filename))
     else:
+        # this parser will error if the .bib file still contains duplicate entries
         parser = bibtex.Parser()
-        bib_data = parser.parse_file(ID[0])
+        bib_data = parser.parse_file(filename)
 
     return bib_data
 
-def get_duplicates(bib_data):
+def get_duplicates(bib_data, filename):
     """
     take bib_data, and get duplicates
     :param homedir: home directory
-    :return:
+    :return: bib_data re-parsed from the de-duplicated copy (or from filename when nothing was removed)
     """
 
     duplicates = []
-    for key in bib_data.entries.keys():
+    for key in bib_data.entries_dict.keys():
         count = str(bib_data.entries).count("'ID\': \'" + key + "\'")
         if count > 1:
             duplicates.append(key)
 
+            # positions of every entry that carries this key
+            idx = np.where([x['ID'] == key for x in bib_data.entries])[0]
+            # skip the first position so that the first occurrence is kept
+            idx = idx[1:]
+            for i in idx[::-1]:  # delete from the back so the remaining positions stay valid
+                del bib_data.entries[i]
+
+        # check that we got the duplicate
+        if (str(bib_data.entries).count("'ID\': \'" + key + "\'")) > 1:
+            raise ValueError("Unable to successfully remove duplicates")
+
     if len(duplicates) > 0:
-        raise ValueError("In your .bib file, we found and removed duplicate entries for:",
-                         ' '.join(map(str, duplicates)))
+        print("In your .bib file, we found and removed duplicate entries for the following entries:\n " +
+                      ' '.join(map(str, duplicates)) +
+              "\n If this is incorrect, please edit your .bib file to give unique identifiers for all unique references.")
+
+        # write the de-duplicated data next to the original file, whatever its extension length
+        new_bib = os.path.splitext(filename)[0] + '_clean.bib'
+        with open(new_bib, 'w') as bibtex_file:
+            bibtexparser.dump(bib_data, bibtex_file)
+
+        # reparse
+        bib_data = get_bib_data(new_bib, "")
+    else:
+        bib_data = get_bib_data(filename, "")
+    return bib_data
 
 
 def get_names_published(homedir, bib_data, cr):
@@ -305,6 +331,7 @@ def get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualC
             author = bib_data.entries[key].persons['author']
         except:
             author = bib_data.entries[key].persons['editor']
+
         FA = author[0].rich_first_names
         LA = author[-1].rich_first_names
         FA = convertLatexSpecialChars(str(FA)[7:-3]).translate(str.maketrans('', '', string.punctuation)).replace(
@@ -328,12 +355,12 @@ def get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualC
 
         # check that we got a name (not an initial) from the bib file, if not try using the title in the crossref API
         try:
-            title = bib_data.entries[key].fields['title'].replace(',', '').\
+            title = bib_data.entries_dict[key].fields['title'].replace(',', '').\
                 replace(',', '').replace('{', '').replace('}','')
         except:
             title = ''
         try:
-            doi = bib_data.entries[key].fields['doi']
+            doi = bib_data.entries_dict[key].fields['doi']
         except:
             doi = ''
         if FA == '' or len(FA.split('.')[0]) <= 1:

From 18a7e28f45986ee3425c79bfcce88542b558663d Mon Sep 17 00:00:00 2001
From: murphyka <murphysics@gmail.com>
Date: Tue, 24 May 2022 14:50:03 -0400
Subject: [PATCH 12/47] Removed the R code and added code to check the number
 of credits allocated to the API key

---
 cleanBib.ipynb | 109 ++++++++++++++++++++++++++++---------------------
 1 file changed, 63 insertions(+), 46 deletions(-)

diff --git a/cleanBib.ipynb b/cleanBib.ipynb
index 4b9593d..01e4ded 100644
--- a/cleanBib.ipynb
+++ b/cleanBib.ipynb
@@ -187,50 +187,67 @@
    },
    "outputs": [],
    "source": [
-    "genderAPI_key <- '&key=YOUR ACCOUNT KEY HERE'\n",
-    "\n",
-    "fileConn<-file(\"genderAPIkey.txt\")\n",
-    "writeLines(c(genderAPI_key), fileConn)\n",
-    "close(fileConn)\n",
-    "\n",
-    "names=read.csv(\"/home/jovyan/cleanedBib.csv\",stringsAsFactors=F)\n",
-    "setwd('/home/jovyan/')\n",
-    "\n",
-    "require(rjson)\n",
-    "gendFA=NULL;gendLA=NULL\n",
-    "gendFA_conf=NULL;gendLA_conf=NULL\n",
-    "\n",
-    "namesIncompleteFA=NULL\n",
-    "namesIncompleteLA=NULL\n",
-    "incompleteKeys=list()\n",
-    "incompleteRows=list()\n",
-    "\n",
-    "for(i in 1:nrow(names)){\n",
-    "  if (nchar(names$FA[i])<2 || grepl(\"\\\\.\", names$FA[i])){\n",
-    "    namesIncompleteFA[i] = i+1\n",
-    "    incompleteKeys = c(incompleteKeys, names$CitationKey[i])\n",
-    "    incompleteRows = c(incompleteRows, i+1)\n",
-    "  }\n",
-    "  namesIncompleteFA = namesIncompleteFA[!is.na(namesIncompleteFA)]\n",
-    "    \n",
-    "  if (nchar(names$LA[i])<2 || grepl(\"\\\\.\", names$LA[i])){\n",
-    "    namesIncompleteLA[i] = i+1\n",
-    "    incompleteKeys = c(incompleteKeys, names$CitationKey[i])\n",
-    "    incompleteRows = c(incompleteRows, i+1)\n",
-    "  }\n",
-    "  namesIncompleteLA = namesIncompleteLA[!is.na(namesIncompleteLA)]\n",
-    "}\n",
-    "\n",
-    "if (length(names$CitationKey[which(names$SelfCite==\"Y\")]>0)){\n",
-    "    print(paste(\"STOP: Please remove self-citations. Then, re-run steps 2 and 3. Here are some suggestions to check for with the following citation keys in your .bib file: \"))\n",
-    "    print(paste(names$CitationKey[which(names$SelfCite==\"Y\")]))\n",
-    "}\n",
-    "\n",
-    "if (length(namesIncompleteFA)>0 || length(namesIncompleteLA)>0){\n",
-    "    print(paste(\"STOP: Please revise incomplete full first names or empty cells. Then, re-run steps 2 and 3. Here are some suggestions to check for with the following citation keys in your .bib file: \"))\n",
-    "    print(paste(incompleteKeys))\n",
-    "    print(paste(\"Only continue if you've ran steps 2 and 3, and this code for step 3 no longer returns errors. For accuracy, please revise any incomplete names in the citations of your .bib file as indicated above. For more info, see rows\", paste(unique(c(namesIncompleteFA, namesIncompleteLA))), \"of cleanedBib.csv\"))\n",
-    "}"
+    "# Do a final check on the bibliography entries\n",
+    "with open(os.path.join(homedir, 'cleanedBib.csv')) as csvfile:\n",
+    "    names_csv = csv.reader(csvfile)\n",
+    "    names_db = []\n",
+    "    for row in names_csv:\n",
+    "        names_db.append(row)\n",
+    "\n",
+    "incomplete_name_bib_keys, self_cite_bib_keys = [[], []]\n",
+    "authors_full_list = []\n",
+    "for row in names_db[1:]:  # Skip the first row, it's just headers\n",
+    "    # Check that the authors' names have at least 2 characters and no periods\n",
+    "    row_id, first_author, last_author, _, self_cite, bib_key = row\n",
+    "    authors_full_list.append(first_author)  # For counting the number of query calls needed\n",
+    "    authors_full_list.append(last_author)\n",
+    "    if len(first_author)<2 or len(last_author)<2 or '.' in first_author+last_author:\n",
+    "        incomplete_name_bib_keys.append(bib_key)\n",
+    "    if self_cite == 'Y':\n",
+    "        self_cite_bib_keys.append(bib_key)\n",
+    "        \n",
+    "if len(self_cite_bib_keys) > 0:\n",
+    "    warning_message = \"STOP: Please remove self-citations. Then, re-run steps 2 and 3. \"\n",
+    "    warning_message += \"Here are some suggestions to check for with the following citation keys in your .bib file: \"\n",
+    "    print(warning_message)\n",
+    "    print(self_cite_bib_keys)\n",
+    "\n",
+    "\n",
+    "if len(incomplete_name_bib_keys) > 0:\n",
+    "    warning_message = \"STOP: Please revise incomplete full first names or empty cells. Then, re-run steps 2 and 3. \"\n",
+    "    warning_message += \"Here are some suggestions to check for with the following citation keys in your .bib file: \"\n",
+    "    print(warning_message)\n",
+    "    print(incomplete_name_bib_keys)\n",
+    "\n",
+    "final_warning_message = \"Only continue if you've ran steps 2 and 3,\"\n",
+    "final_warning_message += \" and this code for step 3 no longer returns errors.\"\n",
+    "print(final_warning_message)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "kernel": "R"
+   },
+   "outputs": [],
+   "source": [
+    "genderAPI_key = '&key=YOUR ACCOUNT KEY HERE'\n",
+    "\n",
+    "# TODO: Remove in the PR that gets rid of argparse. \n",
+    "# The following saves the api key to a txt file just to be reloaded by the next cell\n",
+    "with open(\"genderAPIkey.txt\", 'w') as f:\n",
+    "    f.write(genderAPI_key)\n",
+    "\n",
+    "# Check your credit balance (genderAPI_key already starts with '&key=', so do not add a second 'key=')\n",
+    "url = \"https://gender-api.com/get-stats?\" + genderAPI_key.lstrip('&')\n",
+    "response = urlopen(url)\n",
+    "decoded = response.read().decode('utf-8')\n",
+    "decoded_json = json.loads(decoded)\n",
+    "print('Remaining credits: %s'%decoded_json[\"remaining_requests\"])\n",
+    "print('This should use (at most) %d credits, '%len(np.unique(authors_full_list)) + \\\n",
+    "      'saving you approx %d'%(len(authors_full_list)-len(np.unique(authors_full_list))) + \\\n",
+    "      ' credits if results are stored.')"
    ]
   },
   {
@@ -718,9 +735,9 @@
     "displayed": true,
     "height": 0
    },
-   "version": "0.21.7"
+   "version": "0.20.1"
   }
  },
  "nbformat": 4,
  "nbformat_minor": 1
-}
\ No newline at end of file
+}

From 09388d02b65c75987e9dd76cd1a566ecde38889f Mon Sep 17 00:00:00 2001
From: Jennifer Stiso <jeni.stiso@gmail.com>
Date: Mon, 13 Jun 2022 13:51:22 -0400
Subject: [PATCH 13/47] adding bibcheck to pipeline, defining API functions

---
 cleanBib.ipynb                                |  70 ++----
 tests/aux/pipeline.py                         |   3 +-
 tests/erroneous/pipeline.py                   |   3 +-
 tests/immaculate/cleanedBib.csv               |  28 +--
 tests/immaculate/pipeline.py                  |   3 +-
 .../__pycache__/preprocessing.cpython-39.pyc  | Bin 13808 -> 14879 bytes
 utils/__pycache__/queries.cpython-39.pyc      | Bin 838 -> 1893 bytes
 utils/preprocessing.py                        |  36 ++-
 utils/queries.py                              | 222 ++++++++++++++++++
 9 files changed, 295 insertions(+), 70 deletions(-)

diff --git a/cleanBib.ipynb b/cleanBib.ipynb
index 01e4ded..3decec8 100644
--- a/cleanBib.ipynb
+++ b/cleanBib.ipynb
@@ -148,10 +148,18 @@
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {
-    "kernel": "SoS"
+    "kernel": "R"
    },
+   "outputs": [],
+   "source": [
+    "bib_check(homedir)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
    "source": [
     "## 3. Estimate gender and race of authors from cleaned bibliography\n",
     "\n",
@@ -168,61 +176,19 @@
     "\n",
     "![save button](img/saveBib.png)\n",
     "\n",
-    "Common issues include: \n",
+    "Common issues include:\n",
     "\n",
-    "* Bibliography entry did not include a last author because the author list was truncated by \"and Others\" or \"et al.\" \n",
-    "* Some older journals articles only provide first initial and not full first names, in which case you will need to go digging via Google to identify that person. \n",
-    "* In rare cases where the author cannot be identified even after searching by hand, replace the first name with \"UNKNOWNNAMES\" so that the classifier will estimate the gender as unknown. \n",
+    "* Bibliography entry did not include a last author because the author list was truncated by \"and Others\" or \"et al.\"\n",
+    "* Some older journals articles only provide first initial and not full first names, in which case you will need to go digging via Google to identify that person.\n",
+    "* In rare cases where the author cannot be identified even after searching by hand, replace the first name with \"UNKNOWNNAMES\" so that the classifier will estimate the gender as unknown.\n",
     "\n",
     "__NOTE__: your free account has 500 queries per month. This box contains the code that will use your limited API credits/queries if it runs without error. Re-running all code repeatedly will repeatedly use these credits.\n",
     "\n",
     "Then, run the code blocks below. (click to select the block and then press Ctrl+Enter; or click the block and press the Run button in the top menubar)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
+   ],
    "metadata": {
-    "kernel": "R"
-   },
-   "outputs": [],
-   "source": [
-    "# Do a final check on the bibliography entries\n",
-    "with open(os.path.join(homedir, 'cleanedBib.csv')) as csvfile:\n",
-    "    names_csv = csv.reader(csvfile)\n",
-    "    names_db = []\n",
-    "    for row in names_csv:\n",
-    "        names_db.append(row)\n",
-    "\n",
-    "incomplete_name_bib_keys, self_cite_bib_keys = [[], []]\n",
-    "authors_full_list = []\n",
-    "for row in names_db[1:]:  # Skip the first row, it's just headers\n",
-    "    # Check that the authors' names have at least 2 characters and no periods\n",
-    "    row_id, first_author, last_author, _, self_cite, bib_key = row\n",
-    "    authors_full_list.append(first_author)  # For counting the number of query calls needed\n",
-    "    authors_full_list.append(last_author)\n",
-    "    if len(first_author)<2 or len(last_author)<2 or '.' in first_author+last_author:\n",
-    "        incomplete_name_bib_keys.append(bib_key)\n",
-    "    if self_cite == 'Y':\n",
-    "        self_cite_bib_keys.append(bib_key)\n",
-    "        \n",
-    "if len(self_cite_bib_keys) > 0:\n",
-    "    warning_message = \"STOP: Please remove self-citations. Then, re-run steps 2 and 3. \"\n",
-    "    warning_message += \"Here are some suggestions to check for with the following citation keys in your .bib file: \"\n",
-    "    print(warning_message)\n",
-    "    print(self_cite_bib_keys)\n",
-    "\n",
-    "\n",
-    "if len(incomplete_name_bib_keys) > 0:\n",
-    "    warning_message = \"STOP: Please revise incomplete full first names or empty cells. Then, re-run steps 2 and 3. \"\n",
-    "    warning_message += \"Here are some suggestions to check for with the following citation keys in your .bib file: \"\n",
-    "    print(warning_message)\n",
-    "    print(incomplete_name_bib_keys)\n",
-    "\n",
-    "final_warning_message = \"Only continue if you've ran steps 2 and 3,\"\n",
-    "final_warning_message += \" and this code for step 3 no longer returns errors.\"\n",
-    "print(final_warning_message)"
-   ]
+    "collapsed": false
+   }
   },
   {
    "cell_type": "code",
@@ -740,4 +706,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 1
-}
+}
\ No newline at end of file
diff --git a/tests/aux/pipeline.py b/tests/aux/pipeline.py
index 4bd4640..c524166 100644
--- a/tests/aux/pipeline.py
+++ b/tests/aux/pipeline.py
@@ -34,4 +34,5 @@
     # find and print duplicates
     bib_data = get_duplicates(bib_data, bib_files[0])
     # get names, remove CDS, find self cites
-    get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)
\ No newline at end of file
+    get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)
+bib_check(homedir)
\ No newline at end of file
diff --git a/tests/erroneous/pipeline.py b/tests/erroneous/pipeline.py
index 4bd4640..c524166 100644
--- a/tests/erroneous/pipeline.py
+++ b/tests/erroneous/pipeline.py
@@ -34,4 +34,5 @@
     # find and print duplicates
     bib_data = get_duplicates(bib_data, bib_files[0])
     # get names, remove CDS, find self cites
-    get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)
\ No newline at end of file
+    get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)
+bib_check(homedir)
\ No newline at end of file
diff --git a/tests/immaculate/cleanedBib.csv b/tests/immaculate/cleanedBib.csv
index 7f322e7..9b5f43b 100644
--- a/tests/immaculate/cleanedBib.csv
+++ b/tests/immaculate/cleanedBib.csv
@@ -1,15 +1,15 @@
 Article,FA,LA,Title,SelfCite,CitationKey
-2,Danielle,Perry,,N,bassett2022curious
-3,Gyorgy,Edvard,,N,buzsaki2013memory
-4,Paula,Rachel,,N,chatterjee2021gender
-5,Andrew,Michelle,,N,ethnicolr2022asian
-6,Denzel,Ketanji,,N,ethnicolr2022black
-7,Rafael,Alexandria,,N,ethnicolr2022hispanic
-8,Nicolas,Meryl,,N,ethnicolr2022white
-9,Danielle,Jennifer,,Y,fake2022
-10,Jacqueline,Bradley,,N,fulvio2021imbalance
-11,,JH,,N,jurafsky2018n
-12,,Dina,,N,Lundine2019
-13,Sara,Holly,,N,mitchell2013gendered
-14,William,William,,N,moralia2005
-15,Perry,Danielle,,N,zurn2020network
+2,Gyorgy,Edvard,,N,buzsaki2013memory
+3,,Dina,,N,Lundine2019
+4,Perry,Danielle,,N,zurn2020network
+5,William,William,,N,moralia2005
+6,Danielle,Perry,,N,bassett2022curious
+7,Danielle,Jennifer,,Y,fake2022
+8,,JH,,N,jurafsky2018n
+9,Sara,Holly,,N,mitchell2013gendered
+10,Paula,Rachel,,N,chatterjee2021gender
+11,Jacqueline,Bradley,,N,fulvio2021imbalance
+12,Denzel,Ketanji,,N,ethnicolr2022black
+13,Rafael,Alexandria,,N,ethnicolr2022hispanic
+14,Andrew,Michelle,,N,ethnicolr2022asian
+15,Nicolas,Meryl,,N,ethnicolr2022white
diff --git a/tests/immaculate/pipeline.py b/tests/immaculate/pipeline.py
index 4bd4640..c524166 100644
--- a/tests/immaculate/pipeline.py
+++ b/tests/immaculate/pipeline.py
@@ -34,4 +34,5 @@
     # find and print duplicates
     bib_data = get_duplicates(bib_data, bib_files[0])
     # get names, remove CDS, find self cites
-    get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)
\ No newline at end of file
+    get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)
+bib_check(homedir)
\ No newline at end of file
diff --git a/utils/__pycache__/preprocessing.cpython-39.pyc b/utils/__pycache__/preprocessing.cpython-39.pyc
index b95c194989df590caedee869c0a591ccecbeda2f..b68df308c35f5ef863048c99f4094963f4a58c53 100644
GIT binary patch
delta 1144
zcmZuvO=}ZD7@pZ|lHF~-EPjH@s6|cHpjA<NRYU~qM=Rp0EZfW^S+kjqGn1B<U4ukV
zf?5h*ZBKd<!IPjj@g)9$;K^U$#k=opTb0rs=9zcidESqCW`DI`tCC+VItacm>dox$
zsio3h{B!Bpdwgi=N40o>9(;n_eH2fl>evSde>~l<V@+QFDzI8GFTj+S;6x#X9~di{
z7*V`x4DgGI1S@kT8{<`+7)_L9V&fvZUrNk4vyzL=RgBTY!?zJdCX7J_BVIX$<du^M
zr{fM_QxD^wkz|uxvyYN|oS_CqPmMhYQnetJc`uh(iJdsfNK#0OaUN3JI(2F}+EV%0
z9$=bjBl!tzXTzpuY{$+nG>633I16#bG_F9iZ9{b9+=l027ou_l+^4y=`4S70=95xf
zqSiT-l#~^h2D*r)DQU4R6j)={sJ$f<7#L9}DaU0x0$IusT~2)n)Gkbk5JVM6HR$&d
z7d5=?Yt&D$I5|CY?fN9S9x`7tB3LJyV??sBaiSh5Uj-4DH8Rs;{5Tw*5Is($VqHll
zYGm>5WhNN$Valij&}%lCOhbr@NWI1CZPJJYnGaNps1}1w7)J8}Z}!PXcG6}Gk_4PA
zL_I-jv%xHB1R<Lwi?=p)HWxsH0k21$ZpaiQ>xCh3g;a$59VQ7>Vx6v9AaxdoTXmf_
zJ$8+U3s67eD&RfH*wCeqz^#S<_j1P<j}o6#XgZKUrHt^1gb{Bt0Wzqb;F2&QA|Y#4
zTin!YKGKltI<Bq90K=SMJ_X*W_H5H|6`)!X{bzdf9Kf~H=DoU{bFDR#&JHsons;~q
zyR4^AJP^mza=D}Wfjp4#Ki@XLr&`dEr*-kdKq@x}S-gO{g;XVPjo^-izU-^@-4J`O
z{WqJNUoYX7=Y7H9p*#>)`c3BUXaw93y-jYhL*FBXonQ@Z9iKk@5dJQC9(>5yjvZ`b
z3y&KGSSv;aJEy=k2Gha?qm&+6*!s`Xbny;?t61F7%l_qWXTIZi%dgHJ#uLk*&z?4{
F=|96|Je&Xk

delta 73
zcmbPV@*$fqk(ZZ?0SG3aZBAnIn8+u?cx$5e21dn+cQjcUgBdiHHVZNCH0IP~ED8jw
c(v+S2#o`>J=ww4);mNlx`4|l*zp^w201o{XHvj+t

diff --git a/utils/__pycache__/queries.cpython-39.pyc b/utils/__pycache__/queries.cpython-39.pyc
index 8e5b128322dd86be564bcfd5e2eee6253afb9213..01db4841ccd562f77608c750a828f537a8c8f6b8 100644
GIT binary patch
delta 1124
zcmbVK!EVz)5ZzrncH*X?M5qX;=n7FF0wo2(0VzUMEdnHHD_T@x2}RrNCUN4}>Doz1
zWh;mFiU9Eg^vIDf;KrHra)m1wet@x4phe=sS~ELu-i&v4XFeI<3I#SjZ6R16zwd9~
zG~X2F&Yl9q5y$(;Mx1aBcFgq+RLXp)ej(#JPQnjY9`-EAGDxrrOo-Z;;|R^BsV_;4
z9Tbz0ywf7w*4vo_yn>#8jsYb&(mLb}Y8$Z*5hmApW*<cmwSxBW0gf}N*Er@K<JbtZ
zQ6|c8GfCdVhbY_85_As5=AkF|4?YXLaV+164E`rScgX+AKf;gU04p-1Y?O<vrXG<<
ztD>EYM?KuCqA(XBK6M)LJjnDOc}3%?(0Vro@vyh^2O36Oh!=FEqdl;_;&t>I2X_|{
zPQPcMc>x~%#BSHhHGE0?UdQu?-k3Ip$A!E`8;<MJp%b>~HVXumEEIv28~J^30^^B+
zkhDVU%)1;?+4pGJasqlKY_TxO6%<hD`*hoB!cK!0lcHqhcIPV#L4~uBRaV7tNrq0{
z6)OO5EMC7|u8)I(B+%^=Rz^!OsN~o{B$N)akqksKNlg@?=lK}g&Cwb3hU_GhgwtQh
z^T59+ij)@>evzVyng_qKs;(1+b-&xYa~1Fp5VSkr9H+OhC08z2UBAxU;0_qSLjS9n
zWP&FrU-)e$waKIqxd7@~Vq5~H#`v+qPZi#BzM4?b$;nGidXiSPtCxEnSCQfNYH3=L
z9=Ed+-l<j(28wt+W%NB~r!Q=>{!AGI=JrKklcoDA!=z**rMbdWy59qB^qhLf6-wv*
zZqJ^1@bLbFdmH1;+WM2V&9&vVN2R<nTYgt?N7`nWIbMx7)YOquls*zn+S&;Gq`?$%
znXNZit()qht-GlWdZ=*YuLj6la9s)#%z;9gqNd{_p1FW$@hqX3rcxv`<P=UrS4%`v
R2$Z+p&S?u~kW4~${wG&T47UIP

delta 61
zcmaFLcZ`iMk(ZZ?0SL6U3z8&QCi2M$@c?-#3@MB)3{eazOu-DA%o|;9GqL(<G8QRJ
Mwq#dj{R~nM0Bv6j%>V!Z

diff --git a/utils/preprocessing.py b/utils/preprocessing.py
index b995478..58b210b 100644
--- a/utils/preprocessing.py
+++ b/utils/preprocessing.py
@@ -471,6 +471,40 @@ def self_cites(author, yourFirstAuthor, yourLastAuthor, optionalEqualContributor
     return selfCite
 
 
-
+def bib_check(homedir):
+    """
+    Do a final check on the bibliography entries in <homedir>/cleanedBib.csv and print
+    the citation keys of self-citations and of incomplete (short or abbreviated) names
+    :param homedir: directory that contains cleanedBib.csv
+    :return: None, findings are printed
+    """
+    with open(os.path.join(homedir, 'cleanedBib.csv'), newline='') as csvfile:
+        # csv.reader yields [] for blank lines (e.g. left by hand edits); skip them
+        names_db = [row for row in csv.reader(csvfile) if row]
+
+    incomplete_name_bib_keys, self_cite_bib_keys = [], []
+    for row in names_db[1:]:  # Skip the first row, it's just headers
+        # Columns: Article, FA, LA, Title, SelfCite, CitationKey
+        _, first_author, last_author, _, self_cite, bib_key = row
+        if len(first_author) < 2 or len(last_author) < 2 or '.' in first_author + last_author:
+            incomplete_name_bib_keys.append(bib_key)
+        if self_cite == 'Y':
+            self_cite_bib_keys.append(bib_key)
+
+    if len(self_cite_bib_keys) > 0:
+        warning_message = "STOP: Please remove self-citations. Then, re-run step 2. "
+        warning_message += "Here are some suggestions to check for with the following citation keys in your .bib file: "
+        print(warning_message)
+        print(self_cite_bib_keys)
+
+    if len(incomplete_name_bib_keys) > 0:
+        warning_message = "STOP: Please revise incomplete full first names or empty cells. Then, re-run step 2. "
+        warning_message += "Here are some suggestions to check for with the following citation keys in your .bib file: "
+        print(warning_message)
+        print(incomplete_name_bib_keys)
+
+    final_warning_message = "Only continue if you've run step 2,"
+    final_warning_message += " and this code no longer returns errors."
+    print(final_warning_message)
 
 
diff --git a/utils/queries.py b/utils/queries.py
index 7cac1ac..6985371 100644
--- a/utils/queries.py
+++ b/utils/queries.py
@@ -29,4 +29,226 @@ def namesFromXref(cr, doi, title, authorPos):
     return name
 
 
+def gender_base(homedir):
+    """
+    for unknown gender, fill with base rates; saves a {year: [fa_m, fa_w, la_m, la_w]} dict as a pickle
+    you will never / can't run this (that file is too big to share)
+    """
+    main_df = pd.read_csv('/%s/data/NewArticleData2019.csv'%(homedir),header=0)
+
+    # NOTE(review): each AG value is indexed as [first author, last author] and compared to 'M', 'W', 'U'
+    gender_base = {}
+    for year in np.unique(main_df.PY.values):
+        ydf = main_df[main_df.PY==year].AG
+        fa = np.array([x[0] for x in ydf.values])
+        la = np.array([x[1] for x in ydf.values])
+
+        fa_m = len(fa[fa=='M'])/ len(fa[fa!='U'])
+        fa_w = len(fa[fa=='W'])/ len(fa[fa!='U'])
+
+        # last-author rates must be masked with la itself (they were masked with fa)
+        la_m = len(la[la=='M'])/ len(la[la!='U'])
+        la_w = len(la[la=='W'])/ len(la[la!='U'])
+        gender_base[year] = [fa_m,fa_w,la_m,la_w]
+
+    gender_base[2020] = [fa_m,fa_w,la_m,la_w]
+
+    with open(homedir + '/data/gender_base' + '.pkl', 'wb') as f:
+        pickle.dump(gender_base, f, pickle.HIGHEST_PROTOCOL)
+
+
+def get_pred_demos(authors):
+    """
+    Predict gender and race for the first and last author of every reference and summarise them as percentages
+    :param authors: string 'FALast FAFirst LALast LAFirst' naming your own first/last author; their papers are skipped
+    :return: mm, wm, mw, ww, WW, aw, wa, aa: mean % per first/last pair (m/w = man/woman, W/w/a = white/author of color)
+    """
+    authors = authors.split(' ')
+    print('first author is %s %s ' % (authors[1], authors[0]))
+    print('last author is %s %s ' % (authors[3], authors[2]))
+    print("we don't count these, but check the predictions file to ensure your names did not slip through!")
+
+    citation_matrix = np.zeros((8, 8))
+
+    print('looping through your references, predicting gender and race')
+
+    columns = ['CitationKey', 'Author', 'Gender', 'W', 'A', 'GendCat']
+    paper_df = pd.DataFrame(columns=columns)
+
+    gender = []
+    race = []
+    # NOTE(review): bibfile, gender_key and gender_base (used as a dict) are not parameters; confirm they exist as globals
+    idx = 0
+    for paper in tqdm.tqdm(bibfile.entries, total=len(bibfile.entries)):
+        if 'author' not in bibfile.entries[paper].persons.keys():
+            continue  # some editorials have no authors
+        if 'year' not in bibfile.entries[paper].fields.keys():
+            year = 2020
+        else:
+            year = int(bibfile.entries[paper].fields['year'])
+
+        if year not in gender_base.keys():
+            gb = gender_base[1995]
+        else:
+            gb = gender_base[year]
+
+        fa = bibfile.entries[paper].persons['author'][0]
+        try:
+            fa_fname = fa.first_names[0]
+        except:
+            fa_fname = fa.last_names[0]  # for people like Plato
+        fa_lname = fa.last_names[0]
+
+        la = bibfile.entries[paper].persons['author'][-1]
+        try:
+            la_fname = la.first_names[0]
+        except:
+            la_fname = la.last_names[0]  # for people like Plato
+        la_lname = la.last_names[0]
+        # skip references whose first or last author matches one of your own two authors
+        if fa_fname.lower().strip() == authors[1].lower().strip():
+            if fa_lname.lower().strip() == authors[0].lower().strip():
+                continue
+
+        if fa_fname.lower().strip() == authors[3].lower().strip():
+            if fa_lname.lower().strip() == authors[2].lower().strip():
+                continue
+
+        if la_fname.lower().strip() == authors[1].lower().strip():
+            if la_lname.lower().strip() == authors[0].lower().strip():
+                continue
+
+        if la_fname.lower().strip() == authors[3].lower().strip():
+            if la_lname.lower().strip() == authors[2].lower().strip():
+                continue
+        # drop non-ASCII characters, punctuation other than '-', the word 'Protected' and spaces
+        fa_fname = convertLatexSpecialChars(str(fa_fname.encode("ascii", errors="ignore").decode())).translate(
+            str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '')
+        fa_lname = convertLatexSpecialChars(str(fa_lname.encode("ascii", errors="ignore").decode())).translate(
+            str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '')
+        la_fname = convertLatexSpecialChars(str(la_fname.encode("ascii", errors="ignore").decode())).translate(
+            str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '')
+        la_lname = convertLatexSpecialChars(str(la_lname.encode("ascii", errors="ignore").decode())).translate(
+            str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '')
+        # race predictions: the last four values are reordered to [white, asian, hispanic, black]
+        names = [{'lname': fa_lname, 'fname': fa_fname}]
+        fa_df = pd.DataFrame(names, columns=['fname', 'lname'])
+        asian, hispanic, black, white = pred_fl_reg_name(fa_df, 'lname', 'fname').values[0][-4:]
+        fa_race = [white, asian, hispanic, black]
+
+        names = [{'lname': la_lname, 'fname': la_fname}]
+        la_df = pd.DataFrame(names, columns=['fname', 'lname'])
+        asian, hispanic, black, white = pred_fl_reg_name(la_df, 'lname', 'fname').values[0][-4:]
+        la_race = [white, asian, hispanic, black]
+        # gender from gender-api.com (one request per name), stored as [man, woman] weights
+        url = "https://gender-api.com/get?key=" + gender_key + "&name=%s" % (quote(fa_fname))
+        response = urlopen(url)
+        decoded = response.read().decode('utf-8')
+        fa_gender = json.loads(decoded)
+        if fa_gender['gender'] == 'female':
+            fa_g = [0, fa_gender['accuracy'] / 100.]
+        elif fa_gender['gender'] == 'male':
+            fa_g = [fa_gender['accuracy'] / 100., 0]
+        else:  # 'unknown' or any unexpected answer: use the base rates instead of a stale/unbound value
+            fa_g = gb[:2]
+
+        url = "https://gender-api.com/get?key=" + gender_key + "&name=%s" % (quote(la_fname))
+        response = urlopen(url)
+        decoded = response.read().decode('utf-8')
+        la_gender = json.loads(decoded)
+        # same mapping for the last author, whose base rates are the second half of gb
+        if la_gender['gender'] == 'female':
+            la_g = [0, la_gender['accuracy'] / 100.]
+        elif la_gender['gender'] == 'male':
+            la_g = [la_gender['accuracy'] / 100., 0]
+        else:  # 'unknown' or any unexpected answer: use the base rates instead of a stale/unbound value
+            la_g = gb[2:]
+
+        # one row per author; pd.concat replaces DataFrame.append, which pandas 2.0 removed
+        fa_data = np.array(
+            [paper, '%s,%s' % (fa_fname, fa_lname), '%s,%s' % (fa_gender['gender'], fa_gender['accuracy']), fa_race[0],
+             np.sum(fa_race[1:]), '']).reshape(1, 6)
+        paper_df = pd.concat([paper_df, pd.DataFrame(fa_data, columns=columns)], ignore_index=True)
+        la_data = np.array(
+            [paper, '%s,%s' % (la_fname, la_lname), '%s,%s' % (la_gender['gender'], la_gender['accuracy']), la_race[0],
+             np.sum(la_race[1:]), '%s%s' % (fa_gender['gender'], la_gender['gender'])]).reshape(1, 6)
+        paper_df = pd.concat([paper_df, pd.DataFrame(la_data, columns=columns)], ignore_index=True)
+        # joint first/last-author gender weights, normalised to sum to 1
+        mm = fa_g[0] * la_g[0]
+        wm = fa_g[1] * la_g[0]
+        mw = fa_g[0] * la_g[1]
+        ww = fa_g[1] * la_g[1]
+        mm, wm, mw, ww = [mm, wm, mw, ww] / np.sum([mm, wm, mw, ww])
+
+        gender.append([mm, wm, mw, ww])
+        # same pairing for race: white (index 0) versus the sum of the other three categories
+        ww = fa_race[0] * la_race[0]
+        aw = np.sum(fa_race[1:]) * la_race[0]
+        wa = fa_race[0] * np.sum(la_race[1:])
+        aa = np.sum(fa_race[1:]) * np.sum(la_race[1:])
+
+        race.append([ww, aw, wa, aa])
+        # NOTE: citation_matrix (like paper_df and idx) is accumulated here but not returned
+        paper_matrix = np.zeros((2, 8))
+        paper_matrix[0] = np.outer(fa_g, fa_race).flatten()
+        paper_matrix[1] = np.outer(la_g, la_race).flatten()
+
+        paper_matrix = np.outer(paper_matrix[0], paper_matrix[1])
+
+        citation_matrix = citation_matrix + paper_matrix
+        idx = idx + 1
+
+    mm, wm, mw, ww = np.mean(gender, axis=0) * 100
+    WW, aw, wa, aa = np.mean(race, axis=0) * 100
+
+    return mm, wm, mw, ww, WW, aw, wa,aa
+
+def print_statements(mm, wm, mw, ww, WW, aw, wa,aa):
+    """Fill in the plain-text and LaTeX citation diversity statement templates.
+
+    Gender percentages by first/last author: mm man/man, wm woman/man, mw man/woman,
+    ww woman/woman. Race/ethnicity percentages by first/last author: WW white/white,
+    aw author of color/white, wa white/author of color, aa author of color/author of color.
+    Each value is rounded to two decimals. Returns the tuple (statement, statementLatex).
+    """
+    # Adjacent string literals are used rather than backslash continuations inside one
+    # literal: a continuation would pull this function's indentation into the text.
+    statement = ("Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars "
+        "are under-cited relative to the number of such papers in the field (1-9). Here we sought to proactively consider choosing references that reflect the "
+        "diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. First, we obtained the predicted gender of the first "
+        "and last author of each reference by using databases that store the probability of a first name being carried by a woman (5, 10). By this measure "
+        "(and excluding self-citations to the first and last authors of our current paper), our references contain ww% woman(first)/woman(last), "
+        "MW% man/woman, WM% woman/man, and MM% man/man. This method is limited in that a) names, pronouns, and social media profiles used to construct the "
+        "databases may not, in every case, be indicative of gender identity and b) it cannot account for intersex, non-binary, or transgender people. "
+        "Second, we obtained predicted racial/ethnic category of the first and last author of each reference by databases that store the probability of a "
+        "first and last name being carried by an author of color (11, 12). By this measure (and excluding self-citations), our references contain AA% author of "
+        "color (first)/author of color(last), WA% white author/author of color, AW% author of color/white author, and WW% white author/white author. This method "
+        "is limited in that a) names and Florida Voter Data to make the predictions may not be indicative of racial/ethnic identity, and b) "
+        "it cannot account for Indigenous and mixed-race authors, or those who may face differential biases due to the ambiguous racialization or ethnicization of their names. "
+        "We look forward to future work that could help us to better understand how to support equitable practices in science.")
+
+    # Raw literals keep \cite and \% verbatim without relying on invalid escape sequences.
+    statementLatex = (r"Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars "
+        r"are under-cited relative to the number of such papers in the field \cite{mitchell2013gendered,dion2018gendered,caplar2017quantitative, maliniak2013gender, Dworkin2020.01.03.894378, bertolero2021racial, wang2021gendered, chatterjee2021gender, fulvio2021imbalance}. Here we sought to proactively consider choosing references that reflect the "
+        r"diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. First, we obtained the predicted gender of the first "
+        r"and last author of each reference by using databases that store the probability of a first name being carried by a woman \cite{Dworkin2020.01.03.894378,zhou_dale_2020_3672110}. By this measure "
+        r"(and excluding self-citations to the first and last authors of our current paper), our references contain ww\% woman(first)/woman(last), "
+        r"MW\% man/woman, WM\% woman/man, and MM\% man/man. This method is limited in that a) names, pronouns, and social media profiles used to construct the "
+        r"databases may not, in every case, be indicative of gender identity and b) it cannot account for intersex, non-binary, or transgender people. "
+        r"Second, we obtained predicted racial/ethnic category of the first and last author of each reference by databases that store the probability of a "
+        r"first and last name being carried by an author of color \cite{ambekar2009name, sood2018predicting}. By this measure (and excluding self-citations), our references contain AA\% author of "
+        r"color (first)/author of color(last), WA\% white author/author of color, AW\% author of color/white author, and WW\% white author/white author. This method "
+        r"is limited in that a) names and Florida Voter Data to make the predictions may not be indicative of racial/ethnic identity, and b) "
+        r"it cannot account for Indigenous and mixed-race authors, or those who may face differential biases due to the ambiguous racialization or ethnicization of their names. "
+        r"We look forward to future work that could help us to better understand how to support equitable practices in science.")
+
+    # Placeholder tokens are case-sensitive: 'ww' is the woman/woman gender share,
+    # while 'WW' is the white/white race share. Both templates share the same tokens.
+    placeholders = (('MM', mm), ('WM', wm), ('MW', mw), ('ww', ww), ('WW', WW), ('AW', aw), ('WA', wa), ('AA', aa))
+    for token, value in placeholders:
+        rounded = str(np.around(value, 2))
+        statement = statement.replace(token, rounded)
+        statementLatex = statementLatex.replace(token, rounded)
+
+    return statement, statementLatex
 

From 123d5c4833f51a82781497b924f6e02eb1301743 Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Fri, 15 Jul 2022 14:40:50 -0400
Subject: [PATCH 14/47] ipython req

---
 requirements.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index ff17b13..63ded73 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -34,9 +34,6 @@ habanero==1.2.0
 idna==3.3
 importlib-metadata==4.11.3
 ipykernel==6.13.0
-ipython==8.3.0
-ipython-genutils==0.2.0
-ipywidgets==7.7.0
 jedi==0.18.1
 Jinja2==3.1.2
 jsonschema==4.4.0

From 85ecff6d2d76ae74a552cbf25efe5e336749b59b Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Fri, 15 Jul 2022 14:45:28 -0400
Subject: [PATCH 15/47] pandas req

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 63ded73..4a59258 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -62,7 +62,7 @@ numpy==1.19.5
 oauthlib==3.2.0
 opt-einsum==3.3.0
 packaging==21.3
-pandas==1.4.2
+pandas==1.3.5
 pandocfilters==1.5.0
 parso==0.8.3
 pexpect==4.8.0

From c1d2b4f79a198cdc194754ee826cee12abbf6ec1 Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Fri, 15 Jul 2022 14:47:16 -0400
Subject: [PATCH 16/47] scipy req

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 4a59258..57eb29f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -91,7 +91,7 @@ QtPy==2.1.0
 requests==2.27.1
 requests-oauthlib==1.3.1
 rsa==4.8
-scipy==1.8.0
+scipy==1.7.3
 seaborn==0.11.2
 Send2Trash==1.8.0
 six==1.15.0

From 2c33711473c02c81c0fa1126c52b509978675b9d Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Fri, 15 Jul 2022 14:59:35 -0400
Subject: [PATCH 17/47] update versions

---
 requirements.txt | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 57eb29f..380d35f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,7 @@ cffi==1.15.0
 charset-normalizer==2.0.12
 cycler==0.11.0
 debugpy==1.6.0
-decorator==5.1.1
+decorator==5.0
 defusedxml==0.7.1
 entrypoints==0.4
 ethnicolr==0.8.1
@@ -23,19 +23,22 @@ executing==0.8.3
 fastjsonschema==2.15.3
 flatbuffers==1.12
 fonttools==4.33.3
+folium==0.2.1
 future==0.18.2
 gast==0.4.0
-google-auth==2.6.6
+google-auth==2.0
 google-auth-oauthlib==0.4.6
 google-pasta==0.2.0
 grpcio==1.34.1
 h5py==3.1.0
 habanero==1.2.0
 idna==3.3
+imgaug==0.2.7
 importlib-metadata==4.11.3
-ipykernel==6.13.0
+ipykernel==4.10
+ipython==5.5.0
 jedi==0.18.1
-Jinja2==3.1.2
+Jinja2==3.0
 jsonschema==4.4.0
 jupyter==1.0.0
 jupyter-client==7.3.0
@@ -57,8 +60,8 @@ nbclient==0.6.0
 nbconvert==6.5.0
 nbformat==5.3.0
 nest-asyncio==1.5.5
-notebook==6.4.11
-numpy==1.19.5
+notebook==5.3.0
+numpy==1.21
 oauthlib==3.2.0
 opt-einsum==3.3.0
 packaging==21.3
@@ -88,7 +91,7 @@ PyYAML==6.0
 pyzmq==22.3.0
 qtconsole==5.3.0
 QtPy==2.1.0
-requests==2.27.1
+requests==2.23.0
 requests-oauthlib==1.3.1
 rsa==4.8
 scipy==1.7.3
@@ -105,14 +108,14 @@ tensorflow-estimator==2.5.0
 termcolor==1.1.0
 terminado==0.13.3
 tinycss2==1.1.1
-tornado==6.1
+tornado==5.1.0
 tqdm==4.64.0
 traitlets==5.1.1
 typing-extensions==3.7.4.3
 urllib3==1.26.9
 wcwidth==0.2.5
 webencodings==0.5.1
-Werkzeug==2.1.2
+Werkzeug==2.0
 widgetsnbextension==3.6.0
 wrapt==1.12.1
 zipp==3.8.0

From 100cc9d12b2ff5fbfe4f38f0876abe299d16a9fa Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Fri, 15 Jul 2022 15:00:49 -0400
Subject: [PATCH 18/47] update versions

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 380d35f..07b2e74 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,7 @@ cffi==1.15.0
 charset-normalizer==2.0.12
 cycler==0.11.0
 debugpy==1.6.0
-decorator==5.0
+decorator==4.4.2
 defusedxml==0.7.1
 entrypoints==0.4
 ethnicolr==0.8.1

From f8a396491f24bdfca9a71d745259862b279f5db8 Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Fri, 15 Jul 2022 15:04:49 -0400
Subject: [PATCH 19/47] update versions

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 07b2e74..e51a72a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,7 @@ backcall==0.2.0
 beautifulsoup4==4.11.1
 bibtexparser==1.2.0
 bleach==5.0.0
-cachetools==5.0.0
+cachetools==4.2.4
 certifi==2021.10.8
 cffi==1.15.0
 charset-normalizer==2.0.12

From 91bb1eac9a3e96234ecf75c11b74f6af6795f77d Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Fri, 15 Jul 2022 15:15:33 -0400
Subject: [PATCH 20/47] update versions

---
 requirements.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index e51a72a..1b88742 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,19 +26,19 @@ fonttools==4.33.3
 folium==0.2.1
 future==0.18.2
 gast==0.4.0
-google-auth==2.0
+google-auth==1.35.0
 google-auth-oauthlib==0.4.6
 google-pasta==0.2.0
 grpcio==1.34.1
 h5py==3.1.0
 habanero==1.2.0
 idna==3.3
-imgaug==0.2.7
+imgaug==0.2.6
 importlib-metadata==4.11.3
 ipykernel==4.10
 ipython==5.5.0
 jedi==0.18.1
-Jinja2==3.0
+Jinja2==2.11.3
 jsonschema==4.4.0
 jupyter==1.0.0
 jupyter-client==7.3.0
@@ -115,7 +115,7 @@ typing-extensions==3.7.4.3
 urllib3==1.26.9
 wcwidth==0.2.5
 webencodings==0.5.1
-Werkzeug==2.0
+Werkzeug==1.0.1
 widgetsnbextension==3.6.0
 wrapt==1.12.1
 zipp==3.8.0

From 476a5d89771749cc695e3b17e6016cd13482e4de Mon Sep 17 00:00:00 2001
From: Stiso <stisoj@fvfh14qqq05n.chop.edu>
Date: Fri, 22 Jul 2022 14:39:42 -0400
Subject: [PATCH 21/47] debugging pipeline for clean data

---
 .../immaculate/data/expected_matrix_florida.npy | Bin 0 -> 640 bytes
 tests/immaculate/data/expected_matrix_wiki.npy  | Bin 0 -> 640 bytes
 .../data/expected_small_matrix_florida.npy      | Bin 0 -> 160 bytes
 tests/immaculate/data/gender_base.pkl           | Bin 0 -> 1621 bytes
 tests/immaculate/pipeline.py                    |  16 +++++++++++++++-
 5 files changed, 15 insertions(+), 1 deletion(-)
 create mode 100755 tests/immaculate/data/expected_matrix_florida.npy
 create mode 100644 tests/immaculate/data/expected_matrix_wiki.npy
 create mode 100755 tests/immaculate/data/expected_small_matrix_florida.npy
 create mode 100644 tests/immaculate/data/gender_base.pkl

diff --git a/tests/immaculate/data/expected_matrix_florida.npy b/tests/immaculate/data/expected_matrix_florida.npy
new file mode 100755
index 0000000000000000000000000000000000000000..b52cbd858d721be41d5987d8b41d2071edbbf1d1
GIT binary patch
literal 640
zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh=
zXCxM+0{I#iItmt=ItsN4WCO0oTK)EP!E+8fkMSDLS~uO{lymtK-j9<VjDCcqHU}<p
zsJLb_YfbkGhho;F`_5dfa`?EW;U7<XsY9kz&0eX*8i&WxpE3`>+2CMwYNLVs|Ah{h
zH=WIH6RUKHIiKjiB)`+)sNM-ik$XK3^?vP}=jk*%Opl*x9`h-}!TD|NBDoJq4q`$-
zO>R`qa_|fMHTf8SzQe<sXk8VFN{1kpix%BQg$`FIUH>HRRqpUL*W~-t>AnuzWUb!b
zFAsN6Vf-zyc#ns}E9qFtx$^TI3MNW8_`fQ2m>6r3aJ?thVMfr+)8B5CIxM>Rj>~&p
zm4n$jFH7llfevX)jGw%n<>j!&!eZ;iMF9>KPbPca-?Pu*pLgD4v8$~P!uRgI=lRy=
zut4FxE^F3g2fy7MZzeG=b<kA4-|CxF;Lx>bv$DHJk;6Z2PmfCsl@3=ArmSggnD4On
zyM^}qpl*kopN>7;u_M*tv+LWdKK*45=Q!W|zMRqMpca0ZsiU^gp`~!^-RT|P4xU%<
zZCX1!%psm3=E>PP6CLjCpP~J%FV;bIe~a9e8MzKd=T}unrlvV$F|1nD_oCe4xyLK{
z)y*yrbsHq6aBhrrD9*oLcf8Qgp|w-_`xco#2el(|rKPfw4w`aJ3^Ch69Tvzf(6~H5
o#o_dxr~Q9^7dbcxed9m%#=)WEx9~eYbq9x43*X!;W%YId0KJ+J_y7O^

literal 0
HcmV?d00001

diff --git a/tests/immaculate/data/expected_matrix_wiki.npy b/tests/immaculate/data/expected_matrix_wiki.npy
new file mode 100644
index 0000000000000000000000000000000000000000..cd228b65df77e62123c36bee1302543fdead02a3
GIT binary patch
literal 640
zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh=
zXCxM+0{I#iItmt=ItsN4WCN}VEB3QhKDywL=r?t`ThMHWRq+=qR^FNBuw7i+WgA<U
zgJ8n@lUx^9I$U`?SN*<1lY<STPs>!18i(6IFDYy{E_OJOAMJZtakB&GrNmYV?bQxW
zxrNhDeyVUVTa%kNds3mp;ffQJx;OMWXqIKo5s7Pe@NntB8xxq|u;7TULdUI0ho11d
zcCW)z9Ud(^@Y%6F#o?*e#lvM5=?*dHT7EDs2y#ePl&@D>T<G96&FPrcKX(UlJ`GKC
z1s{jU4%zv2eO3+)d;aVcPoL!Ax1#Fe*>@=p388%*sy`wf{MKkXwr&h{khpCdGrcg!
zA$9)$r!(W-9lrc7u}U{|aM(BL`6NqbD~CfzZ#SMvIN;FHc5<)$^bUtN;rqR+XSX|4
zpRQ(Ut*dg_cP!$Py2?_A9~BMj|D7#yxMc98Ue}?-A#jbtk1bU>4)#|nE<D|{*ulF?
zyYiCHB!?|;lI)y3GaW8_IW+&9pWx7&wU|ANr`4e?K=r!A+Y$#o9=pr`76dzN%{V3T
zpgPb&DEn}um+=ILh(qd1xj_*Qith?jcnV`3`U7+guPyU*n6JOdXijRW1M7w(Ns((@
z99r{5yWR@>IxJtTvsvW0or6p3iSOAL`y816y7n1r#XDFUJw2ps5$eDfvwm908GncR
l^hHwpJ{38X-;})H``*prtIac)(;1Eqod1u>3Cy#1008li1`YrK

literal 0
HcmV?d00001

diff --git a/tests/immaculate/data/expected_small_matrix_florida.npy b/tests/immaculate/data/expected_small_matrix_florida.npy
new file mode 100755
index 0000000000000000000000000000000000000000..34fa5b99ce9d4b1a8dcdcb2cc178bbd88f8332ae
GIT binary patch
literal 160
zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh=
zXCxM+0{I$7ItoUbItsN4WCO19TK)EP!E+94_HnOREwIHw`g!|a1>yY;d}T+^AL-cQ
F006H#DmwrG

literal 0
HcmV?d00001

diff --git a/tests/immaculate/data/gender_base.pkl b/tests/immaculate/data/gender_base.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..1e0d03054ed32c6984c5edf3004469830e898b44
GIT binary patch
literal 1621
zcma*ne^8TU902gaU=YQj)fCi%nLl;|7I_`V5PeV%UmEA|XV50dCMeu7gu$?(5md{B
zAvb*)iAYoj%<$w_g#jT)NFpL`z!<m{umLC9p-GD6tS|KR*Yi*BJ-7Gnd*9FV{XWn8
z?8$d26|xvh&j&PD!%kPqGgSOTM-|C@xiT#~RjN=(RkVhcbx4{fRnTW?jq6ehypyt3
z8Oe(g=hP57ni1`ElHNmeqnR3~Fj^!M3IB#?=_i_zP0J$O``rGmLw^c+@ido0Gckkn
z4z!CxhQzoPt_6t{vPB*_RoDN3LN{AK&)K(WS#k<9<%J0Mn*^PHv_T`D+e9IwPt%N$
zbDly|DB&R|U?YWYcQKS}SEOOuPtxKEuJ@%MfZDgz->NwcG}4eAzj6okvG|TOV0w$B
zNsU2T-Z+H@gD+HCzX=E0A%QUslhw}nad>%{4v;kUp7y!C5oiF{!dzDg^zehmx_DTt
zQS@a_(O560w@K>avR8ge5A?~C_txES02=aRx)^0b1eI|m?G2b3NV=`+*cV?thQt$B
zmvy$XfmZjdTu`P%)%mC^0-rZAH4=0x{CdKX)B7kioOd)L$QDZ>`*D@*>pOiE8v5Qg
zTixP|=^#nV0-eHM6hNY(+HvwO?6Cc>4##&DB8Y6hw$ShO-wqFv)UTv$dAS;@c8nV=
zH=uJnr~B*;FAmTvRRsg9-o{mjN&3Dr5(ZE&wh{(Vhv9O<eI8I<U)S8rGE7Ybwd*}I
zTLR(ynd+z3XshA7+QXNhX(@*#n6@+=4D#e-IzrM4Eqg;EoE=k-b+{R20rgG#%zXh4
zqA9G1CC;#7Y9{F{qGwSlB>Lws4xs9de$Mw^gC%sv7H73x$8?mWizl~uf<hyoRLPp`
zTj8J6v^%(M{VOopWaD<X?#q~tku>+raH#iopdMrUiw2GZeJpO4GLnGiuo=yQKuqtD
zG{N6+fCpkELLX1nc|gO={8oc*KTwOedUe|xOf4jRt5RYrC;?jWQ`DYk(7A2gTFVmu
z36ZVuKJC>7U}`0(ZQymCS#Xy^W@(o(M#Bf1;1#m#Kp{|5IjbqLAJbn*I!eVVex3)q
zGjsQ<3?HCjy=tB-Z1qU7DY!-UJEp&qG`Ie_#$7_8QI9I;nK>cQ)!W+jlW^0ROCDbR
zs8opQI7u%tLmy5428p##3%x_v0~LInUpQR@5$|LE%_9;_CrH{=tB&!2%fX!6@P~A^
z8)&G;v*+9pG;DnJ;&g`=(@BEb1J>@<#pOX_ZCl)6j4RM{*XG+TFTuxo=Hoc5pTpEf
vP@DbR59?I#K-J(f#zjv#(9H*Zqu%I*$x=@2s7dup-f4GRy0ZUAs*?W?kJg7J

literal 0
HcmV?d00001

diff --git a/tests/immaculate/pipeline.py b/tests/immaculate/pipeline.py
index c524166..83c1865 100644
--- a/tests/immaculate/pipeline.py
+++ b/tests/immaculate/pipeline.py
@@ -6,6 +6,8 @@
 wd = Path(os.getcwd())
 sys.path.insert(1, f'{wd.parent.parent.absolute()}/utils')
 from preprocessing import *
+from ethnicolr import pred_fl_reg_name
+import tensorflow as tf
 
 cr = Crossref()
 #homedir = '/home/jovyan/'
@@ -35,4 +37,16 @@
     bib_data = get_duplicates(bib_data, bib_files[0])
     # get names, remove CDS, find self cites
     get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)
-bib_check(homedir)
\ No newline at end of file
+bib_check(homedir)
+
+# queries
+try:
+    with open("genderAPIkey.txt", "r") as f:  # context manager closes the key file
+        genderAPI_key = f.readline().strip()  # strip() also drops a trailing '\r'
+except (OSError, UnicodeError):  # key file missing or unreadable: ask interactively
+    genderAPI_key = input("Enter genderAPI key:")
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # NOTE(review): set after tensorflow is imported above; confirm it still takes effect
+
+mm, wm, mw, ww, WW, aw, wa, aa, citation_matrix = get_pred_demos((yourFirstAuthor+' '+yourLastAuthor).replace(',',''), homedir, bib_data, genderAPI_key)
+statement, statementLatex = print_statements(mm, wm, mw, ww, WW, aw, wa, aa)
+print(statement)
\ No newline at end of file

From de95a8e20ff1e3caf7c1912f61184a5677c1802b Mon Sep 17 00:00:00 2001
From: Stiso <jeni.stiso@gmail.com>
Date: Fri, 22 Jul 2022 15:31:11 -0400
Subject: [PATCH 22/47] removing duplicate queries

---
 cleanBib.ipynb                                | 253 +-----------------
 tests/aux/data/expected_matrix_florida.npy    | Bin 0 -> 640 bytes
 tests/aux/data/expected_matrix_wiki.npy       | Bin 0 -> 640 bytes
 .../data/expected_small_matrix_florida.npy    | Bin 0 -> 160 bytes
 tests/aux/data/gender_base.pkl                | Bin 0 -> 1621 bytes
 .../data/expected_matrix_florida.npy          | Bin 0 -> 640 bytes
 tests/erroneous/data/expected_matrix_wiki.npy | Bin 0 -> 640 bytes
 .../data/expected_small_matrix_florida.npy    | Bin 0 -> 160 bytes
 tests/erroneous/data/gender_base.pkl          | Bin 0 -> 1621 bytes
 .../__pycache__/preprocessing.cpython-310.pyc | Bin 0 -> 14445 bytes
 utils/__pycache__/queries.cpython-310.pyc     | Bin 0 -> 10186 bytes
 utils/queries.py                              | 151 ++++++-----
 12 files changed, 92 insertions(+), 312 deletions(-)
 create mode 100755 tests/aux/data/expected_matrix_florida.npy
 create mode 100644 tests/aux/data/expected_matrix_wiki.npy
 create mode 100755 tests/aux/data/expected_small_matrix_florida.npy
 create mode 100644 tests/aux/data/gender_base.pkl
 create mode 100755 tests/erroneous/data/expected_matrix_florida.npy
 create mode 100644 tests/erroneous/data/expected_matrix_wiki.npy
 create mode 100755 tests/erroneous/data/expected_small_matrix_florida.npy
 create mode 100644 tests/erroneous/data/gender_base.pkl
 create mode 100644 utils/__pycache__/preprocessing.cpython-310.pyc
 create mode 100644 utils/__pycache__/queries.cpython-310.pyc

diff --git a/cleanBib.ipynb b/cleanBib.ipynb
index 3decec8..bcafb12 100644
--- a/cleanBib.ipynb
+++ b/cleanBib.ipynb
@@ -49,30 +49,16 @@
    },
    "outputs": [],
    "source": [
-    "import numpy as np\n",
-    "import bibtexparser\n",
-    "from bibtexparser.bparser import BibTexParser\n",
     "import glob\n",
-    "import subprocess\n",
-    "import os\n",
-    "from pybtex.database.input import bibtex\n",
-    "import csv\n",
-    "from pylatexenc.latex2text import LatexNodes2Text \n",
-    "import unicodedata\n",
-    "import re\n",
-    "import pandas as pd\n",
     "from habanero import Crossref\n",
-    "import string\n",
-    "from time import sleep\n",
-    "import tqdm\n",
-    "import matplotlib.pylab as plt\n",
-    "import matplotlib.gridspec as gridspec\n",
-    "import json\n",
-    "import pickle\n",
-    "from urllib.request import urlopen\n",
-    "from urllib.parse import quote\n",
-    "from ethnicolr import census_ln, pred_census_ln,pred_wiki_name\n",
-    "from pybtex.database import parse_file\n",
+    "import sys\n",
+    "import os\n",
+    "from pathlib import Path\n",
+    "wd = Path(os.getcwd())\n",
+    "sys.path.insert(1, f'{wd.parent.parent.absolute()}/utils')\n",
+    "from preprocessing import *\n",
+    "from ethnicolr import pred_fl_reg_name\n",
+    "import tensorflow as tf\n",
     "import seaborn as sns\n",
     "\n",
     "cr = Crossref()\n",
@@ -243,226 +229,9 @@
     "\n",
     "import tensorflow as tf\n",
     "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'\n",
-    "import argparse\n",
-    "parser = argparse.ArgumentParser()\n",
-    "parser.add_argument('-bibfile',action='store',dest='bibfile',default=' '.join(bib_files))\n",
-    "parser.add_argument('-homedir',action='store',dest='homedir',default='/home/jovyan/')\n",
-    "parser.add_argument('-authors',action='store',dest='authors', default=(yourFirstAuthor+' '+yourLastAuthor).replace(',',''))\n",
-    "parser.add_argument('-method',action='store',dest='method',default='florida')\n",
-    "parser.add_argument('-font',action='store',dest='font',default='Palatino') # hey, we all have our favorite\n",
-    "parser.add_argument('-gender_key',action='store',dest='gender_key',default=genderAPI_key)\n",
-    "r = parser.parse_args()\n",
-    "locals().update(r.__dict__)\n",
-    "bibfile = bib_data\n",
-    "\n",
-    "\n",
-    "def gender_base():\n",
-    "\t\"\"\"\n",
-    "\tfor unknown gender, fill with base rates\n",
-    "\tyou will never / can't run this (that file is too big to share)\n",
-    "\t\"\"\"\n",
-    "\tmain_df = pd.read_csv('/%s/data/NewArticleData2019.csv'%(homedir),header=0)\n",
-    "\n",
-    "\n",
-    "\tgender_base = {}\n",
-    "\tfor year in np.unique(main_df.PY.values):\n",
-    "\t\tydf = main_df[main_df.PY==year].AG\n",
-    "\t\tfa = np.array([x[0] for x in ydf.values])\n",
-    "\t\tla = np.array([x[1] for x in ydf.values])\n",
-    "\n",
-    "\t\tfa_m = len(fa[fa=='M'])/ len(fa[fa!='U'])\n",
-    "\t\tfa_w = len(fa[fa=='W'])/ len(fa[fa!='U'])\n",
-    "\n",
-    "\t\tla_m = len(la[fa=='M'])/ len(la[la!='U'])\n",
-    "\t\tla_w = len(la[fa=='W'])/ len(la[la!='U'])\n",
-    "\n",
-    "\t\tgender_base[year] = [fa_m,fa_w,la_m,la_w]\n",
-    "\n",
-    "\tgender_base[2020] = [fa_m,fa_w,la_m,la_w]\n",
-    "\n",
-    "\twith open(homedir + '/data/gender_base' + '.pkl', 'wb') as f:\n",
-    "\t\tpickle.dump(gender_base, f, pickle.HIGHEST_PROTOCOL)\n",
-    "\n",
-    "\n",
-    "with open(homedir + 'data/gender_base' + '.pkl', 'rb') as f:\n",
-    "\tgender_base =  pickle.load(f)\n",
-    "\n",
-    "authors = authors.split(' ')\n",
-    "print ('first author is %s %s '%(authors[1],authors[0]))\n",
-    "print ('last author is %s %s '%(authors[3],authors[2]))\n",
-    "print (\"we don't count these, but check the predictions file to ensure your names did not slip through!\")\n",
-    "\n",
-    "citation_matrix = np.zeros((8,8))\n",
-    "matrix_idxs = {'white_m':0,'api_m':1,'hispanic_m':2,'black_m':3,'white_f':4,'api_f':5,'hispanic_f':6,'black_f':7}\n",
-    "\n",
-    "asian = [0,1,2]\n",
-    "black = [3,4]\n",
-    "white = [5,6,7,8,9,11,12]\n",
-    "hispanic = [10]\n",
-    "print ('looping through your references, predicting gender and race')\n",
-    "\n",
-    "columns=['CitationKey','Author','Gender','W','A', 'GendCat']\n",
-    "paper_df = pd.DataFrame(columns=columns)\n",
-    "\n",
-    "gender = []\n",
-    "race = []\n",
-    "\n",
-    "\n",
-    "idx = 0\n",
-    "for paper in tqdm.tqdm(bibfile.entries,total=len(bibfile.entries)): \n",
-    "\tif 'author' not in bibfile.entries[paper].persons.keys():\n",
-    "\t\tcontinue #some editorials have no authors\n",
-    "\tif 'year' not in bibfile.entries[paper].fields.keys():\n",
-    "\t\tyear = 2020\n",
-    "\telse: year = int(bibfile.entries[paper].fields['year'])  \n",
-    "\t\n",
-    "\tif year not in gender_base.keys():\n",
-    "\t\tgb = gender_base[1995]\n",
-    "\telse:\n",
-    "\t\tgb = gender_base[year]\n",
-    "\n",
-    "\tfa = bibfile.entries[paper].persons['author'][0]\n",
-    "\ttry:fa_fname = fa.first_names[0] \n",
-    "\texcept:fa_fname = fa.last_names[0] #for people like Plato\n",
-    "\tfa_lname = fa.last_names[0] \n",
-    "\n",
-    "\tla = bibfile.entries[paper].persons['author'][-1]\n",
-    "\ttry:la_fname = la.first_names[0] \n",
-    "\texcept:la_fname = la.last_names[0] #for people like Plato\n",
-    "\tla_lname = la.last_names[0]\n",
-    "\n",
-    "\tif fa_fname.lower().strip() == authors[1].lower().strip():\n",
-    "\t\tif fa_lname.lower().strip()  == authors[0].lower().strip() :\n",
-    "\t\t\tcontinue\n",
-    "\n",
-    "\tif fa_fname.lower().strip()  == authors[3].lower().strip() :\n",
-    "\t\tif fa_lname.lower().strip()  == authors[2].lower().strip() :\n",
-    "\t\t\tcontinue\n",
-    "\n",
-    "\tif la_fname.lower().strip()  == authors[1].lower().strip() :\n",
-    "\t\tif la_lname.lower().strip()  == authors[0].lower().strip() :\n",
-    "\t\t\tcontinue\n",
-    "\t\n",
-    "\tif la_fname.lower().strip()  == authors[3].lower().strip() :\n",
-    "\t\tif la_lname.lower().strip()  == authors[2].lower().strip() :\n",
-    "\t\t\tcontinue\n",
-    "\n",
-    "\tfa_fname = convertLatexSpecialChars(str(fa_fname.encode(\"ascii\", errors=\"ignore\").decode())).translate(str.maketrans('', '', re.sub('\\-', '', string.punctuation))).replace('Protected',\"\").replace(\" \",'')\n",
-    "\tfa_lname = convertLatexSpecialChars(str(fa_lname.encode(\"ascii\", errors=\"ignore\").decode())).translate(str.maketrans('', '', re.sub('\\-', '', string.punctuation))).replace('Protected',\"\").replace(\" \",'')\n",
-    "\tla_fname = convertLatexSpecialChars(str(la_fname.encode(\"ascii\", errors=\"ignore\").decode())).translate(str.maketrans('', '', re.sub('\\-', '', string.punctuation))).replace('Protected',\"\").replace(\" \",'') \n",
-    "\tla_lname = convertLatexSpecialChars(str(la_lname.encode(\"ascii\", errors=\"ignore\").decode())).translate(str.maketrans('', '', re.sub('\\-', '', string.punctuation))).replace('Protected',\"\").replace(\" \",'')\n",
-    "\n",
-    "\tnames = [{'lname': fa_lname,'fname':fa_fname}]\n",
-    "\tfa_df = pd.DataFrame(names,columns=['fname','lname'])\n",
-    "\tasian,hispanic,black,white = pred_fl_reg_name(fa_df,'lname','fname').values[0][-4:]\n",
-    "\tfa_race = [white,asian,hispanic,black]\n",
-    "\t\n",
-    "\tnames = [{'lname': la_lname,'fname':la_fname}]\n",
-    "\tla_df = pd.DataFrame(names,columns=['fname','lname'])\n",
-    "\tasian,hispanic,black,white = pred_fl_reg_name(la_df,'lname','fname').values[0][-4:]\n",
-    "\tla_race = [white,asian,hispanic,black]\n",
-    "\n",
-    "\turl = \"https://gender-api.com/get?key=\" + gender_key + \"&name=%s\" %(quote(fa_fname))\n",
-    "\tresponse = urlopen(url)\n",
-    "\tdecoded = response.read().decode('utf-8')\n",
-    "\tfa_gender = json.loads(decoded)\n",
-    "\tif fa_gender['gender'] == 'female':\n",
-    "\t\tfa_g = [0,fa_gender['accuracy']/100.]\n",
-    "\tif fa_gender['gender'] == 'male':\n",
-    "\t\tfa_g = [fa_gender['accuracy']/100.,0]\n",
-    "\tif fa_gender['gender'] == 'unknown':\n",
-    "\t\tfa_g = gb[:2]\n",
-    "\n",
-    "\turl = \"https://gender-api.com/get?key=\" + gender_key + \"&name=%s\" %(quote(la_fname))\n",
-    "\tresponse = urlopen(url)\n",
-    "\tdecoded = response.read().decode('utf-8')\n",
-    "\tla_gender = json.loads(decoded)\n",
-    "\tif la_gender['gender'] == 'female':\n",
-    "\t\tla_g = [0,la_gender['accuracy']/100.]\n",
-    "\t\n",
-    "\tif la_gender['gender'] == 'male':\n",
-    "\t\tla_g = [la_gender['accuracy']/100.,0]\n",
-    "\n",
-    "\tif la_gender['gender'] == 'unknown':\n",
-    "\t\tla_g = gb[2:] \n",
-    "\t\n",
-    "\tfa_data = np.array([paper,'%s,%s'%(fa_fname,fa_lname),'%s,%s'%(fa_gender['gender'],fa_gender['accuracy']),fa_race[0],np.sum(fa_race[1:]), '']).reshape(1,6)\n",
-    "\tpaper_df = paper_df.append(pd.DataFrame(fa_data,columns=columns),ignore_index =True)\n",
-    "\tla_data = np.array([paper,'%s,%s'%(la_fname,la_lname),'%s,%s'%(la_gender['gender'],la_gender['accuracy']),la_race[0],np.sum(la_race[1:]), '%s%s' % (fa_gender['gender'], la_gender['gender'])]).reshape(1,6)\n",
-    "\tpaper_df = paper_df.append(pd.DataFrame(la_data,columns=columns),ignore_index =True)\n",
-    "\n",
-    "\tmm = fa_g[0]*la_g[0]\n",
-    "\twm = fa_g[1]*la_g[0]\n",
-    "\tmw = fa_g[0]*la_g[1]\n",
-    "\tww = fa_g[1]*la_g[1]\n",
-    "\tmm,wm,mw,ww = [mm,wm,mw,ww]/np.sum([mm,wm,mw,ww])\n",
-    "\t\n",
-    "\tgender.append([mm,wm,mw,ww])\n",
-    "\n",
-    "\tww = fa_race[0] * la_race[0]\n",
-    "\taw = np.sum(fa_race[1:]) * la_race[0]\n",
-    "\twa = fa_race[0] * np.sum(la_race[1:])\n",
-    "\taa = np.sum(fa_race[1:]) * np.sum(la_race[1:])\n",
-    "\n",
-    "\trace.append([ww,aw,wa,aa])\n",
-    "\n",
-    "\tpaper_matrix = np.zeros((2,8))\n",
-    "\tpaper_matrix[0] = np.outer(fa_g,fa_race).flatten() \n",
-    "\tpaper_matrix[1] = np.outer(la_g,la_race).flatten() \n",
-    "\n",
-    "\tpaper_matrix = np.outer(paper_matrix[0],paper_matrix[1]) \n",
-    "\n",
-    "\tcitation_matrix = citation_matrix + paper_matrix\n",
-    "\tidx = idx + 1\n",
-    "\n",
-    "mm,wm,mw,ww = np.mean(gender,axis=0)*100\n",
-    "WW,aw,wa,aa = np.mean(race,axis=0)*100\n",
-    "\n",
-    "statement = \"Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars \\\n",
-    "are under-cited relative to the number of such papers in the field (1-9). Here we sought to proactively consider choosing references that reflect the \\\n",
-    "diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. First, we obtained the predicted gender of the first \\\n",
-    "and last author of each reference by using databases that store the probability of a first name being carried by a woman (5, 10). By this measure \\\n",
-    "(and excluding self-citations to the first and last authors of our current paper), our references contain ww% woman(first)/woman(last), \\\n",
-    "MW% man/woman, WM% woman/man, and MM% man/man. This method is limited in that a) names, pronouns, and social media profiles used to construct the \\\n",
-    "databases may not, in every case, be indicative of gender identity and b) it cannot account for intersex, non-binary, or transgender people. \\\n",
-    "Second, we obtained predicted racial/ethnic category of the first and last author of each reference by databases that store the probability of a \\\n",
-    "first and last name being carried by an author of color (11, 12). By this measure (and excluding self-citations), our references contain AA% author of \\\n",
-    "color (first)/author of color(last), WA% white author/author of color, AW% author of color/white author, and WW% white author/white author. This method \\\n",
-    "is limited in that a) names and Florida Voter Data to make the predictions may not be indicative of racial/ethnic identity, and b) \\\n",
-    "it cannot account for Indigenous and mixed-race authors, or those who may face differential biases due to the ambiguous racialization or ethnicization of their names.  \\\n",
-    "We look forward to future work that could help us to better understand how to support equitable practices in science.\"\n",
-    "\n",
-    "statement = statement.replace('MM',str(np.around(mm,2)))\n",
-    "statement = statement.replace('WM',str(np.around(wm,2)))\n",
-    "statement = statement.replace('MW',str(np.around(mw,2)))\n",
-    "statement = statement.replace('ww',str(np.around(ww,2)))\n",
-    "statement = statement.replace('WW',str(np.around(WW,2)))\n",
-    "statement = statement.replace('AW',str(np.around(aw,2)))\n",
-    "statement = statement.replace('WA',str(np.around(wa,2)))\n",
-    "statement = statement.replace('AA',str(np.around(aa,2)))\n",
-    "\n",
-    "statementLatex = \"Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars \\\n",
-    "are under-cited relative to the number of such papers in the field \\cite{mitchell2013gendered,dion2018gendered,caplar2017quantitative, maliniak2013gender, Dworkin2020.01.03.894378, bertolero2021racial, wang2021gendered, chatterjee2021gender, fulvio2021imbalance}. Here we sought to proactively consider choosing references that reflect the\\\n",
-    "diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. First, we obtained the predicted gender of the first \\\n",
-    "and last author of each reference by using databases that store the probability of a first name being carried by a woman \\cite{Dworkin2020.01.03.894378,zhou_dale_2020_3672110}. By this measure \\\n",
-    "(and excluding self-citations to the first and last authors of our current paper), our references contain ww\\% woman(first)/woman(last), \\\n",
-    "MW\\% man/woman, WM\\% woman/man, and MM\\% man/man. This method is limited in that a) names, pronouns, and social media profiles used to construct the \\\n",
-    "databases may not, in every case, be indicative of gender identity and b) it cannot account for intersex, non-binary, or transgender people. \\\n",
-    "Second, we obtained predicted racial/ethnic category of the first and last author of each reference by databases that store the probability of a \\\n",
-    "first and last name being carried by an author of color \\cite{ambekar2009name, sood2018predicting}. By this measure (and excluding self-citations), our references contain AA\\% author of \\\n",
-    "color (first)/author of color(last), WA\\% white author/author of color, AW\\% author of color/white author, and WW\\% white author/white author. This method \\\n",
-    "is limited in that a) names and Florida Voter Data to make the predictions may not be indicative of racial/ethnic identity, and b) \\\n",
-    "it cannot account for Indigenous and mixed-race authors, or those who may face differential biases due to the ambiguous racialization or ethnicization of their names.  \\\n",
-    "We look forward to future work that could help us to better understand how to support equitable practices in science.\"\n",
-    "\n",
-    "statementLatex = statementLatex.replace('MM',str(np.around(mm,2)))\n",
-    "statementLatex = statementLatex.replace('WM',str(np.around(wm,2)))\n",
-    "statementLatex = statementLatex.replace('MW',str(np.around(mw,2)))\n",
-    "statementLatex = statementLatex.replace('ww',str(np.around(ww,2)))\n",
-    "statementLatex = statementLatex.replace('WW',str(np.around(WW,2)))\n",
-    "statementLatex = statementLatex.replace('AW',str(np.around(aw,2)))\n",
-    "statementLatex = statementLatex.replace('WA',str(np.around(wa,2)))\n",
-    "statementLatex = statementLatex.replace('AA',str(np.around(aa,2)))"
+    "\n",
+    "mm, wm, mw, ww, WW, aw, wa, aa, citation_matrix = get_pred_demos((yourFirstAuthor+' '+yourLastAuthor).replace(',',''), homedir, bib_data, gender_key)\n",
+    "statement, statementLatex = print_statements(mm, wm, mw, ww, WW, aw, wa, aa)"
    ]
   },
   {
diff --git a/tests/aux/data/expected_matrix_florida.npy b/tests/aux/data/expected_matrix_florida.npy
new file mode 100755
index 0000000000000000000000000000000000000000..b52cbd858d721be41d5987d8b41d2071edbbf1d1
GIT binary patch
literal 640
zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh=
zXCxM+0{I#iItmt=ItsN4WCO0oTK)EP!E+8fkMSDLS~uO{lymtK-j9<VjDCcqHU}<p
zsJLb_YfbkGhho;F`_5dfa`?EW;U7<XsY9kz&0eX*8i&WxpE3`>+2CMwYNLVs|Ah{h
zH=WIH6RUKHIiKjiB)`+)sNM-ik$XK3^?vP}=jk*%Opl*x9`h-}!TD|NBDoJq4q`$-
zO>R`qa_|fMHTf8SzQe<sXk8VFN{1kpix%BQg$`FIUH>HRRqpUL*W~-t>AnuzWUb!b
zFAsN6Vf-zyc#ns}E9qFtx$^TI3MNW8_`fQ2m>6r3aJ?thVMfr+)8B5CIxM>Rj>~&p
zm4n$jFH7llfevX)jGw%n<>j!&!eZ;iMF9>KPbPca-?Pu*pLgD4v8$~P!uRgI=lRy=
zut4FxE^F3g2fy7MZzeG=b<kA4-|CxF;Lx>bv$DHJk;6Z2PmfCsl@3=ArmSggnD4On
zyM^}qpl*kopN>7;u_M*tv+LWdKK*45=Q!W|zMRqMpca0ZsiU^gp`~!^-RT|P4xU%<
zZCX1!%psm3=E>PP6CLjCpP~J%FV;bIe~a9e8MzKd=T}unrlvV$F|1nD_oCe4xyLK{
z)y*yrbsHq6aBhrrD9*oLcf8Qgp|w-_`xco#2el(|rKPfw4w`aJ3^Ch69Tvzf(6~H5
o#o_dxr~Q9^7dbcxed9m%#=)WEx9~eYbq9x43*X!;W%YId0KJ+J_y7O^

literal 0
HcmV?d00001

diff --git a/tests/aux/data/expected_matrix_wiki.npy b/tests/aux/data/expected_matrix_wiki.npy
new file mode 100644
index 0000000000000000000000000000000000000000..cd228b65df77e62123c36bee1302543fdead02a3
GIT binary patch
literal 640
zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh=
zXCxM+0{I#iItmt=ItsN4WCN}VEB3QhKDywL=r?t`ThMHWRq+=qR^FNBuw7i+WgA<U
zgJ8n@lUx^9I$U`?SN*<1lY<STPs>!18i(6IFDYy{E_OJOAMJZtakB&GrNmYV?bQxW
zxrNhDeyVUVTa%kNds3mp;ffQJx;OMWXqIKo5s7Pe@NntB8xxq|u;7TULdUI0ho11d
zcCW)z9Ud(^@Y%6F#o?*e#lvM5=?*dHT7EDs2y#ePl&@D>T<G96&FPrcKX(UlJ`GKC
z1s{jU4%zv2eO3+)d;aVcPoL!Ax1#Fe*>@=p388%*sy`wf{MKkXwr&h{khpCdGrcg!
zA$9)$r!(W-9lrc7u}U{|aM(BL`6NqbD~CfzZ#SMvIN;FHc5<)$^bUtN;rqR+XSX|4
zpRQ(Ut*dg_cP!$Py2?_A9~BMj|D7#yxMc98Ue}?-A#jbtk1bU>4)#|nE<D|{*ulF?
zyYiCHB!?|;lI)y3GaW8_IW+&9pWx7&wU|ANr`4e?K=r!A+Y$#o9=pr`76dzN%{V3T
zpgPb&DEn}um+=ILh(qd1xj_*Qith?jcnV`3`U7+guPyU*n6JOdXijRW1M7w(Ns((@
z99r{5yWR@>IxJtTvsvW0or6p3iSOAL`y816y7n1r#XDFUJw2ps5$eDfvwm908GncR
l^hHwpJ{38X-;})H``*prtIac)(;1Eqod1u>3Cy#1008li1`YrK

literal 0
HcmV?d00001

diff --git a/tests/aux/data/expected_small_matrix_florida.npy b/tests/aux/data/expected_small_matrix_florida.npy
new file mode 100755
index 0000000000000000000000000000000000000000..34fa5b99ce9d4b1a8dcdcb2cc178bbd88f8332ae
GIT binary patch
literal 160
zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh=
zXCxM+0{I$7ItoUbItsN4WCO19TK)EP!E+94_HnOREwIHw`g!|a1>yY;d}T+^AL-cQ
F006H#DmwrG

literal 0
HcmV?d00001

diff --git a/tests/aux/data/gender_base.pkl b/tests/aux/data/gender_base.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..1e0d03054ed32c6984c5edf3004469830e898b44
GIT binary patch
literal 1621
zcma*ne^8TU902gaU=YQj)fCi%nLl;|7I_`V5PeV%UmEA|XV50dCMeu7gu$?(5md{B
zAvb*)iAYoj%<$w_g#jT)NFpL`z!<m{umLC9p-GD6tS|KR*Yi*BJ-7Gnd*9FV{XWn8
z?8$d26|xvh&j&PD!%kPqGgSOTM-|C@xiT#~RjN=(RkVhcbx4{fRnTW?jq6ehypyt3
z8Oe(g=hP57ni1`ElHNmeqnR3~Fj^!M3IB#?=_i_zP0J$O``rGmLw^c+@ido0Gckkn
z4z!CxhQzoPt_6t{vPB*_RoDN3LN{AK&)K(WS#k<9<%J0Mn*^PHv_T`D+e9IwPt%N$
zbDly|DB&R|U?YWYcQKS}SEOOuPtxKEuJ@%MfZDgz->NwcG}4eAzj6okvG|TOV0w$B
zNsU2T-Z+H@gD+HCzX=E0A%QUslhw}nad>%{4v;kUp7y!C5oiF{!dzDg^zehmx_DTt
zQS@a_(O560w@K>avR8ge5A?~C_txES02=aRx)^0b1eI|m?G2b3NV=`+*cV?thQt$B
zmvy$XfmZjdTu`P%)%mC^0-rZAH4=0x{CdKX)B7kioOd)L$QDZ>`*D@*>pOiE8v5Qg
zTixP|=^#nV0-eHM6hNY(+HvwO?6Cc>4##&DB8Y6hw$ShO-wqFv)UTv$dAS;@c8nV=
zH=uJnr~B*;FAmTvRRsg9-o{mjN&3Dr5(ZE&wh{(Vhv9O<eI8I<U)S8rGE7Ybwd*}I
zTLR(ynd+z3XshA7+QXNhX(@*#n6@+=4D#e-IzrM4Eqg;EoE=k-b+{R20rgG#%zXh4
zqA9G1CC;#7Y9{F{qGwSlB>Lws4xs9de$Mw^gC%sv7H73x$8?mWizl~uf<hyoRLPp`
zTj8J6v^%(M{VOopWaD<X?#q~tku>+raH#iopdMrUiw2GZeJpO4GLnGiuo=yQKuqtD
zG{N6+fCpkELLX1nc|gO={8oc*KTwOedUe|xOf4jRt5RYrC;?jWQ`DYk(7A2gTFVmu
z36ZVuKJC>7U}`0(ZQymCS#Xy^W@(o(M#Bf1;1#m#Kp{|5IjbqLAJbn*I!eVVex3)q
zGjsQ<3?HCjy=tB-Z1qU7DY!-UJEp&qG`Ie_#$7_8QI9I;nK>cQ)!W+jlW^0ROCDbR
zs8opQI7u%tLmy5428p##3%x_v0~LInUpQR@5$|LE%_9;_CrH{=tB&!2%fX!6@P~A^
z8)&G;v*+9pG;DnJ;&g`=(@BEb1J>@<#pOX_ZCl)6j4RM{*XG+TFTuxo=Hoc5pTpEf
vP@DbR59?I#K-J(f#zjv#(9H*Zqu%I*$x=@2s7dup-f4GRy0ZUAs*?W?kJg7J

literal 0
HcmV?d00001

diff --git a/tests/erroneous/data/expected_matrix_florida.npy b/tests/erroneous/data/expected_matrix_florida.npy
new file mode 100755
index 0000000000000000000000000000000000000000..b52cbd858d721be41d5987d8b41d2071edbbf1d1
GIT binary patch
literal 640
zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh=
zXCxM+0{I#iItmt=ItsN4WCO0oTK)EP!E+8fkMSDLS~uO{lymtK-j9<VjDCcqHU}<p
zsJLb_YfbkGhho;F`_5dfa`?EW;U7<XsY9kz&0eX*8i&WxpE3`>+2CMwYNLVs|Ah{h
zH=WIH6RUKHIiKjiB)`+)sNM-ik$XK3^?vP}=jk*%Opl*x9`h-}!TD|NBDoJq4q`$-
zO>R`qa_|fMHTf8SzQe<sXk8VFN{1kpix%BQg$`FIUH>HRRqpUL*W~-t>AnuzWUb!b
zFAsN6Vf-zyc#ns}E9qFtx$^TI3MNW8_`fQ2m>6r3aJ?thVMfr+)8B5CIxM>Rj>~&p
zm4n$jFH7llfevX)jGw%n<>j!&!eZ;iMF9>KPbPca-?Pu*pLgD4v8$~P!uRgI=lRy=
zut4FxE^F3g2fy7MZzeG=b<kA4-|CxF;Lx>bv$DHJk;6Z2PmfCsl@3=ArmSggnD4On
zyM^}qpl*kopN>7;u_M*tv+LWdKK*45=Q!W|zMRqMpca0ZsiU^gp`~!^-RT|P4xU%<
zZCX1!%psm3=E>PP6CLjCpP~J%FV;bIe~a9e8MzKd=T}unrlvV$F|1nD_oCe4xyLK{
z)y*yrbsHq6aBhrrD9*oLcf8Qgp|w-_`xco#2el(|rKPfw4w`aJ3^Ch69Tvzf(6~H5
o#o_dxr~Q9^7dbcxed9m%#=)WEx9~eYbq9x43*X!;W%YId0KJ+J_y7O^

literal 0
HcmV?d00001

diff --git a/tests/erroneous/data/expected_matrix_wiki.npy b/tests/erroneous/data/expected_matrix_wiki.npy
new file mode 100644
index 0000000000000000000000000000000000000000..cd228b65df77e62123c36bee1302543fdead02a3
GIT binary patch
literal 640
zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh=
zXCxM+0{I#iItmt=ItsN4WCN}VEB3QhKDywL=r?t`ThMHWRq+=qR^FNBuw7i+WgA<U
zgJ8n@lUx^9I$U`?SN*<1lY<STPs>!18i(6IFDYy{E_OJOAMJZtakB&GrNmYV?bQxW
zxrNhDeyVUVTa%kNds3mp;ffQJx;OMWXqIKo5s7Pe@NntB8xxq|u;7TULdUI0ho11d
zcCW)z9Ud(^@Y%6F#o?*e#lvM5=?*dHT7EDs2y#ePl&@D>T<G96&FPrcKX(UlJ`GKC
z1s{jU4%zv2eO3+)d;aVcPoL!Ax1#Fe*>@=p388%*sy`wf{MKkXwr&h{khpCdGrcg!
zA$9)$r!(W-9lrc7u}U{|aM(BL`6NqbD~CfzZ#SMvIN;FHc5<)$^bUtN;rqR+XSX|4
zpRQ(Ut*dg_cP!$Py2?_A9~BMj|D7#yxMc98Ue}?-A#jbtk1bU>4)#|nE<D|{*ulF?
zyYiCHB!?|;lI)y3GaW8_IW+&9pWx7&wU|ANr`4e?K=r!A+Y$#o9=pr`76dzN%{V3T
zpgPb&DEn}um+=ILh(qd1xj_*Qith?jcnV`3`U7+guPyU*n6JOdXijRW1M7w(Ns((@
z99r{5yWR@>IxJtTvsvW0or6p3iSOAL`y816y7n1r#XDFUJw2ps5$eDfvwm908GncR
l^hHwpJ{38X-;})H``*prtIac)(;1Eqod1u>3Cy#1008li1`YrK

literal 0
HcmV?d00001

diff --git a/tests/erroneous/data/expected_small_matrix_florida.npy b/tests/erroneous/data/expected_small_matrix_florida.npy
new file mode 100755
index 0000000000000000000000000000000000000000..34fa5b99ce9d4b1a8dcdcb2cc178bbd88f8332ae
GIT binary patch
literal 160
zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh=
zXCxM+0{I$7ItoUbItsN4WCO19TK)EP!E+94_HnOREwIHw`g!|a1>yY;d}T+^AL-cQ
F006H#DmwrG

literal 0
HcmV?d00001

diff --git a/tests/erroneous/data/gender_base.pkl b/tests/erroneous/data/gender_base.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..1e0d03054ed32c6984c5edf3004469830e898b44
GIT binary patch
literal 1621
zcma*ne^8TU902gaU=YQj)fCi%nLl;|7I_`V5PeV%UmEA|XV50dCMeu7gu$?(5md{B
zAvb*)iAYoj%<$w_g#jT)NFpL`z!<m{umLC9p-GD6tS|KR*Yi*BJ-7Gnd*9FV{XWn8
z?8$d26|xvh&j&PD!%kPqGgSOTM-|C@xiT#~RjN=(RkVhcbx4{fRnTW?jq6ehypyt3
z8Oe(g=hP57ni1`ElHNmeqnR3~Fj^!M3IB#?=_i_zP0J$O``rGmLw^c+@ido0Gckkn
z4z!CxhQzoPt_6t{vPB*_RoDN3LN{AK&)K(WS#k<9<%J0Mn*^PHv_T`D+e9IwPt%N$
zbDly|DB&R|U?YWYcQKS}SEOOuPtxKEuJ@%MfZDgz->NwcG}4eAzj6okvG|TOV0w$B
zNsU2T-Z+H@gD+HCzX=E0A%QUslhw}nad>%{4v;kUp7y!C5oiF{!dzDg^zehmx_DTt
zQS@a_(O560w@K>avR8ge5A?~C_txES02=aRx)^0b1eI|m?G2b3NV=`+*cV?thQt$B
zmvy$XfmZjdTu`P%)%mC^0-rZAH4=0x{CdKX)B7kioOd)L$QDZ>`*D@*>pOiE8v5Qg
zTixP|=^#nV0-eHM6hNY(+HvwO?6Cc>4##&DB8Y6hw$ShO-wqFv)UTv$dAS;@c8nV=
zH=uJnr~B*;FAmTvRRsg9-o{mjN&3Dr5(ZE&wh{(Vhv9O<eI8I<U)S8rGE7Ybwd*}I
zTLR(ynd+z3XshA7+QXNhX(@*#n6@+=4D#e-IzrM4Eqg;EoE=k-b+{R20rgG#%zXh4
zqA9G1CC;#7Y9{F{qGwSlB>Lws4xs9de$Mw^gC%sv7H73x$8?mWizl~uf<hyoRLPp`
zTj8J6v^%(M{VOopWaD<X?#q~tku>+raH#iopdMrUiw2GZeJpO4GLnGiuo=yQKuqtD
zG{N6+fCpkELLX1nc|gO={8oc*KTwOedUe|xOf4jRt5RYrC;?jWQ`DYk(7A2gTFVmu
z36ZVuKJC>7U}`0(ZQymCS#Xy^W@(o(M#Bf1;1#m#Kp{|5IjbqLAJbn*I!eVVex3)q
zGjsQ<3?HCjy=tB-Z1qU7DY!-UJEp&qG`Ie_#$7_8QI9I;nK>cQ)!W+jlW^0ROCDbR
zs8opQI7u%tLmy5428p##3%x_v0~LInUpQR@5$|LE%_9;_CrH{=tB&!2%fX!6@P~A^
z8)&G;v*+9pG;DnJ;&g`=(@BEb1J>@<#pOX_ZCl)6j4RM{*XG+TFTuxo=Hoc5pTpEf
vP@DbR59?I#K-J(f#zjv#(9H*Zqu%I*$x=@2s7dup-f4GRy0ZUAs*?W?kJg7J

literal 0
HcmV?d00001

diff --git a/utils/__pycache__/preprocessing.cpython-310.pyc b/utils/__pycache__/preprocessing.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d0d8187dded3ab9a5f8648d7a7b8c294c4c3c80
GIT binary patch
literal 14445
zcmb_jTW}lKdEOfq3lIcAiWDhXR@RqTf+|s#;x-QBsF8KCl}HqA$rmAZP!MNHf&{SO
z*@Y+~i`FrnO77IPo0%r<G)<`_ZKF2Lv@`9r)6TRneQ92LnaRsOrZb(<Tc_?!T1_nV
z`~O|sD9VjH0kvn(p38sk|M~ul+|kjrg5Nj)^j{YKYfe%AjXnl{IzB#&$NLcip$N66
z6zQ$i)VfyG)F@3a>YQg34ZQVQqHY$=I4@aDa-LPR@HT3xdb*g7^G1pzoR=wP@J`f5
zFGuvU#hfrj@-3y97nVrjJtop(1n+T?5u<qT5?PVMdqU*J7~Yd&T<pSox0n!<cu$GP
z#FUtROD#@|Jz_6X_K1hXKD_sehsA!p9}*uEHs1ThBjQoK9~SB*rSSM|j8mB_sA2AW
z*>|qZxx(>|Uv{qfNHQ0y3rIsO^K5kiiSuRYITGLMp^8cs<6i=A6VE9;UK@e0tf)d2
zT7cQvqb%w8)`ann)=>j>Mc*<+;-*URt%NWU(|mJN>F9xWT6uk{WBAFy*i?nSYRIPo
zgD8mPs)o3}W#Jn$^c8K7a#K~6wTDGI(61`;(*Z^k=oryRyY81)9lP4F%dKnnVzuVj
zi>|aW5L>vFR^4g%hi&QjE!nVZRnNEGMY~e<%YM~u*sIQlm$vcqm&?9gmP}^`^AHhQ
zX!&--^@(_wR?!6ITC3d7R+gR0DkwSL48A2z5KW@z3%BLpNwoaMBTt8F`i^?!wuZd9
z_VY)6M!!NTOna?`rgSTg=Y_g#H9}+l-253ys}h>RsbF2hgx9Q9{m`IMhkDIvgl4(f
zbQ+?NC|a1T5=~ZUO2=!}{9?Y<XnBq(m8C3ilw^6mI6joJSZ$PRp?d9>A}7(xolhTq
z37f+^>iJdAT{(I>o@?)@aIPM$)SPkyo9JlEuhzVyP3gp)tu~fsnj7I*fAUJ0t!B&5
zf~Gf(V6Tx;)2gXiY6gGfs`iGZ8mghmDP&e?Cc8g6yapcLh60fQn-Cjk+E;uvP_U&m
zq0`pcRJX7lu?aQ2O?r2<K*I)7d80k)>pxw2-*{6kSnYC}x6Ev_EX#GfQLa0)w&zO>
z4O`G(#^$3mE<;<3ev7s8(yWawVI#LjCoduq?aNu4=d0AK7ACmN?IZ?u`-6M;?zOWA
z4%_q6^_`0Eh=T`0rJXoXIBXvXbr3Bi!s&|JxavrsPxYmyQ>m6~r<QSkLmgcUlURUS
zx#EO-;&OvUF1s&XKKpc-ez{z0IcKDFWkC-O>P%=*$HR2Ld%X5hup6C|k0Y@0cuyd(
zk`q`e?VhG_c-P2UKi-yN8F#tP5I><^+v6wSz(=5LDK{177%JYHuLbI+3XY-k0zWRa
zEgg%e3nStT@i%x!^sQ+fqmyVnit-6;h)<%d39b^=N^UEaLn#BLJ{zd7MfFUi5C=LN
zD3=u901wiplpryMjRcNkwW~uLpjK@>cmvFKSJ+>6Y!6c!@07zf@s{<fzYNyn*!8Lq
zU|clk;8k_^sv~%V+@}3~N44i9-D9_D$1|h3&D<s-b4UpFrrVTb6f;)bYD4Zvv|!S?
zZ#bcj^NeU2EI@69iN&f!%#7OGwwPf)YWf0qW)6LM8kG1@TBfDa-$6Y4@ucw>YF?Gc
zP-18;hwg@NBDcGiIAXMhIC#8<k6|^iszYl?--BzY<8ILDZopC1cqKp8TS+QAxRS$V
zX+6*IdJd=4s(zkVHKvQzJpcb#O}A0o*uHSQVDjS_vZTWhEtxz(-?UotAcBIyOUA1u
z4^b*FRzm(9#Sc?3L%~rBo}_?yZU4N5W6^$!mdK|yI*$V0J_OMkJ)v4j>&`yaN?1R^
zis{x}18XIZqrlKQQI}{tG4sqI;wziV7TD%Z+T&{b(D1Af%PQgVXNMRUe-t!HY&jfd
zrd9G?E?7{D={Nz^wv9tRj0z<@9@Wg4{5{k^O7Agun$Ld!I;xM8EU=%z&~Y|}j(0*B
z-19{H@u8kSpwC5p?(FF>QT8g;YG_uM8m@FCvD<>lgOQ&=JRE5?s&ti#vR@8UC{r)j
zs%_kvPJ@yqU0MY#G^ml#pi<k%X`{q49*@S7G54yHQiiTE3kf~$_wV=+I3+?`B7O&+
zwW`a5gs-BrvV?OW%s{DP_rGIwh&Nf_Y$-e{@YBKgJ5Aqsj6H<2bl|MBokWmeYE8UH
z_9(atfi*F6`b{p|F_E6#Aw3BWm=rm1K$AEi(en*Iv8i5Fyc)+%`WpC74#Z6#U-T`T
zbcR0(ZoYPiF5g1F{r5vFCEV()efr`#0%*<F0#M7cBY?^LDz2Ys*F`t3U-fI?xWqR(
zN)>wSb*Oxo`Fc)m@f48SDS~WK%|@%f;7F=UnLR}4F_65!efwawA)IUI(830FGCJ&h
zt{=jZg*wszulV}slOF0APr(QiYb{4^02Vqm+}*g?t94*96VS_da4**Z;L1x*d(?OR
za_vZj#k??q&epwRVySu+uuYv|j)Gd{dbPIE)(+bRE7U47Osu<d)sr7X7b3<&{5Z^Q
zxGi~>IQ_|}LpG&mDfK)qf+(?I$<I)7g6FU}Iy6VcWHb~{K1XHLYQ&L_;A7hk4=3r;
z^3F=P{;G5qFU2#xhLRq!G9w8nr|rdEF`-%jb<_CE<Ihyhdq4@QaZkIK)wH{ouDxNX
zlDOG@e1Ps0=2OH%yWnn9Rb!Y3et;be=>n(brn*T|8%YL6``FO_YL=UhEOl}3tlccr
z<?E8zKtPyRv_1Y3qu~C2n6=y5XHLuk5ha>{cs|3>Fi~+^4If88J505hi!pPxQR1BK
zvo!`7ElP1A79QTgm^?cFnS_NolJkg$$pvh6!0RDKLF~H666tCNOCY>EhT($=lEAeE
z)=0>gge*E_K7_9>kRsL|r4|zHuMbVlvRijVRnFQJu@S>ob>#+63BgzVqW;X<<u26q
zof-hflD)oMtt`jcG@TG4?1REtEJLWUs~*cwadJGULUfqBhZE9PiCAZJq+Y<dE}t!E
z#cai`*WE@bLe^e!I%3kLhTHJ`azm7*DES*r$15Zx%}O!NU5tbp!-bq9X~IHtK9XYf
z(4u9g-5Q$FXhN&o_V#7UEIEFuoB1`6@#u=Qtlg?%S|6lI9C-x^L(4;aoE~b`I}X;c
z1YQ>?E4Wo}D&WB9-&SaGR&<CXhEQ)PQ&_esS`@;yB>kQYaF5+!N!K^GlGl|*!b6F-
zEUI;ymz8iVbA2FG!nza(?GbQyl!3gAFi6DG^3U(w9=CPLPYG*NeN$aa`sqM}uw<eK
zU-d^sN~E_k{wV5dn>bJC<p}Y8mBl5Ip;pD{l+rQ%Y+#D)ruJ^Olk{_2`9KMhH*xx2
zf&6Y@r*7)1(y)#x-mk8sHvL_}<By?MZd2<38DfKuVH5Yha3`$uHeivyJO77mEVqpn
zu|*3KZ3S2)653-4?8A0%4}W)wG3JKIxt&%!dEngX1GDylJL&;u;P-JM+GM+EZAnU9
zLhoO<NJ0_H9_5Czs{N+6Mz@+;)Xtse?F8<i%{(b5Q2Le{>QFUAyNu+fF#QZk<=EoQ
z6MsoG3GPfQ?MG?f^+!4LOwFy7Yu*V&y2Tkx&b1G|)F>}NWWdq#S{2d*EVgO@r4fW5
z7+w42a}AqVzunVo9Jbd<Nz%fs(#*yJN575@or$bEUZhCDHM{86YVJCn_BaP9kgufn
zQrbR8w=WJl{R1qL#5@k$O=tkX3BXo-YQ5h$^_Z?T8`uhWqg@rCSzW9`m%#0nYqf3`
z?h{7>YB}EQ?VpeooG!5>N&^YibxAjGVLZ&lohpecZdikKIt;7BWSk-AP*9E|2(?BN
zm*z6alH!8|E%^v#j!|%d0z&1XUN1N0ixkVCEx@~&VHl!SZ}QQ{_4zUtzf6VF{UzhJ
zLcJ=k$<HIJm~1%f#7T-H5h3PJ(MBjzeclb(;bs3LWOz>@Fp^nZmn3fO*E6_0C)9nq
z1pzyw8)}vXZS!tEkwJXly|iKIX<Vst1r;mAgt|X!VgwH}rtcvHW76?}33U>9OoG;(
zB!NvFYs*(xNKm$Js&cOnRz)!!+eBc|9U~I&=<n|7q<nKF8Kg*!iI^oUfYEf2-byV2
zjD8FE53_;v5F7Z+5F4O2N4Vt-hoevxroqbAJJ}%10ChxUz#MZy4ymi1d@#C|3Gzfa
z$Zw4Tx~c?MS#<Y<OfN5q7?g})J?96@ZDqwML<DUsCvu=VCXB6oq+AWgBG!sDGZ+I<
z{dzD4`k+DdJ7d8JD2_#QKx)`{4kicYU;?U69(^#jwDZ{ic5v)qfGU8nxprl3PE2;j
zgAt7I&)!zrDX>HB2F4_IZyFu_6{Rt1DE|1yk0SQz?-GF0#75VCus&FN3Z;!+>HpcW
zbkd(7%3!>^P%=w&-%~eoJC)_N?vC^2PAb?XCRerhR9Wn;I**tyC@7%cvi!+lbW^2y
z>`c%u2onD802duXi0{EDm)tvCQcUgv>%>@%C0;Lo%AXD(NQj4~z)<5W2m2E<m1tx*
zcf6k~_C+Od+4Of=>X`DL>a7jU^*B=)U;BZ-C&-}w<f?&@{Hs5;iOVR+VCFyFnH(DN
z<2$Y_1WaKBlRNh}n0#Pw{(7i4@p#@-*RDrB@S(8xr=z}Y(q+4!D7~o$P*5z=h4QiX
z1-=s3mtC;?X0_>%UWZv_UrJ)E8DENBTy^&3VXkP$zw;*}ghe3wPL>&|Bc81;%v8Lq
z@{34heu}G#SQfQ$M?HK;4Q}aSN}x@MEsiXv)>@>5gl4grq$uu$TV^qNGLq6l?d-`=
zJAbm6xJ)uy(dvo=#Sw^KtaAUnvoY7s6Y>?1cDnps5^!o8B>mk6qrWFl<NLlFm>C!{
zSl#hux`9*C4SZFV&<DMY1zm%736rL^6JPp5=XL1KNPhwa#TC3lwGuI2)~PV#O+B;Z
zG{D{EiT@zFbj53L>{Z%N5JP6crqr9{fy=mCsXf61iAV3D`!t!MS}v%PB%ny~!^#{I
zorVTPgXuzw_ep=lkRlBgBvASdD7?UlFsKMt^!t5F=dtkR5~|)Um8G~SpN1q~Ly!mh
ze4il2KBax&!-vwt6%Xv3+#s@jJro&yWYA0NH9C$)We4eZGzd+TsZ`2-=ExBiua97w
zc=g-IUpe{W+_|~uX6-phUeT^$p?|&X*{~5@1so7P+(BwfyZjgkv*`vo(MF%K=OcJg
zFE?5M3>#>rEFp%l<!9Le@Q7)9GxoW&_Q|=^_QkpLS18U<2>l^6!_*D{caW^iSSV^S
z(&Cdjh^#*q_ed-alX8GG4O|Z~*o>&(ya^5yHiAZh@*{e*f4aJf9b53axjWIbyJMTN
zFE2Zd!{`QZQUizy0E!yHzNNki$6f$RT%9RAB=ea4P;<S|AmJ8js%t=vo)olLjAnG&
zp60rwvmB1T@Y2Q0XG#~&&7He&@_cAT$?m!=Q+ZM?N0=!E@B<|Q>}9NI0B|CitQjGt
z(IN3a9DAv;K)~`m;a0XKzzkJ$VE!S=K1?~#yEqaEp7mf6;vyAj1i&6atgt&w#)v*b
zSZ68QqrgWHCRy|bD@wEC;uHzu#Y~(y=aL>K34A|K6OhDkN$8WCs9DSn9#)hqjtwSo
z)#3<tQ};NB3Br#X#Z-MGmO{}EF|N1(D{YJEc&E*^>H|s??o|#)FD`qIv`3)<B~U#&
zN_>Er2kqM7fc`W3>wO1-VH!}h8&H?P9;T($yfy*l$-cX$0R;+hVV2S1IAR%fAMz}u
zpcF!2!n<i*`%fbe#0WcD26<4p+|`W1IF*ai^9E7djgmH0GZScQTGQUx|KoHzt5NGj
zb2t9>e~{s}O;X*Uq@~NRqN5d3VRU~aCXvD76dvz8;B*2gGsc%mQf5N+p9H>y<^o6)
zG2ly5GWPMM<tJ85AWp)UiIpVd%T$osvOWY~J~edHsw=6jGzs4Csjt7+Nu%D#x75xE
zP-ZHCdUOToa*q;>fXmLWooJ_f=}?LS^<{vOG7&Z!$RRwKxdPN6a-cBU8AUA5v8<o<
zbA*A!m>4G!8A5&@LNRcrF#PeAT{rcc>ed9IM_|winDO!MjB>5)N34~N&=%`{ei-*<
zcPGcRavzaa4z#EaDUW{`<j|wZfcgWy5>Qei9^cP^I#bJavhBT?&#C?jU@pvfPP%Vk
zES{s?JV$w~1e`7CPZ*v26@}NTOJRx-`cR26tPhYr_1}A*W@>9%Oasef#dusQdQwRd
zdfpQxg0Ubk_5wYRf<_LQcrP$<ImW~fb<sA^+Vks5N`YJw(e-JTcA)$25Z#ABH#<Q0
zogL`<MAzRJWCD#4UoaX#sS>0&El35il)xurANCPRUXMUNd%(#EO#flpfe5jJK|UA{
z(h;)W0VDrl@T{zz5NUv>hud#|nDT{?0^eBM?lQgrW^8ZSZPlPqsCkg@p{pT%GT)3I
zU#zE2Uy8oCiJn^KYy?WX7}d51QL0=8|L8+gS)<Pe1ug|m3M7KLLbm-mT#`0!M6hgb
zPe|w(pi+Q>A7~A>8oOFwDA&r+CeSV3a9YxZ@rSevP=LfZlu?G)K2EgaT44B*tH@Hh
zNfn_2c4UK-$~Dx4i3=B_*M9YdRvGs*6^3rZxe8l)RIlHGg(|QDNC4DGx8c^I$*M(`
z2B@@%h`k6cdQbI?yB$}X*PeRqG5ZvBp=68Va+lm!mT?WWU;fhPT7WI*1T-zbNFc$!
zgyvh(e4lN><AveJJlRsBn)VrJYAW1``Q>WO_1xz2#;kqPW`g!SX~;Y6E6?$S0d4gL
z4>&=?2I&f>4M@#}X9@SeJBOr5@?3H|u)9fh2hI?}t+`7ZKtxVWc<o~mY5O!UPSxMw
zE<5C;6YUa$GjotI7F>7LzItrt$@Y~O%S@FT06|Q`Prs5(lUK}xQi`XTP43a!e)QfP
zYkz8oxsheihLVcbmVg<Ll{Ow}uzRc4J`s;yR6PJH8av8&XP7H5SC^KLV029ov{#_A
zgU3w|$hM!2>mRJv3j;g3O9AM4pdS@?seu*3Rvg^v?W1$Rq(`D@q#jg&cReWI;_bov
zk5uPa%9g~C+Rx2PsPHg8pk>epuI?=Z)#nN{X3v2xkIoVe8`2@VLZ^>*J9{<A&(UFh
z20_szogmbNK=q1jQLu-Cy_A>g`M5+DFZD~I+KD8b)5yO(V1~Me@4|lhD^&Ll3cgH1
z59-P{>637ue2apwP{6?Umk=WZ9Bs&A5<A|*(Sz>;%otG40oz7`ADg5jJz~n2<%UPb
z>@Zb_vl)Mqrx*GBgd@#XqvA)WJ+j1+GO;+u1S3a_m?9axBV2uhVsBIMYY5=Y1!h^o
zN;)3n;=(TZ>y-I21<zA(j*9JZo3x1mtk)p*oPs<ct8gQ6r6(CAlPO951_kd@z=Qt+
zVsInuo;NmXFmU#GHTuB1sMQ<MDzRD6qAr~xIiWB4F<edc;OarFp!Z$ShfQP%oaPOv
zDzloZ0g##ilxcv;CkTArBjx22S~r#f<eUJk)b8c+WROnawV#rulKO6_K+St5K&Xa1
zg1wI-Mv#`ED^l-j`u!=pp)MtxqXEcj-rIi{*2TY)<d0GzRYUhjK$U<iAgbbRVLUhB
zSFoyi)(t2}VOoP~QouIyrh46gEe7zEY%#3nvN#1wp<YVrI+Y>^vwm9ox{i9ufs_$W
zu?A8y*A=8`u)Fl@jfynXpt=}=^}ogT`-5+5*EP_}43x@o3cyaU4|#am5S&W`&aD8h
zWT8CEA}t@M`Q5a&^MMrr2nQp89-~WUXUx|`4wRHg9SQ)Eg))lhd|%`{<38+FpfR?p
z$St_b8G(tfarguzQEFGLtQ#kT<F>qsejceh2k896ePws|%WlsjwPyhOg{k&G^+8G`
zmh-J08Hvcc+Cv`Er8;Ejj4H%FEa1ZM-y?m%*}kp(zP!H0d?@RIM{%gs*^@(`=ewV=
zj*drir$9{*V?JziM<{AvjE{Qc>2r>ldAZh}jrAUps)1D*q$cnxkjMQ8Dl!<M4e57C
zH$>-v_W%3Dm09_;mn`YD&OVuu;khf6P4>N8f#36{wuF-aKO4AJ-W*a+6?91^G5IP5
zWRtmNvR@c8Pxkf;bIc4$xe~L&uTfcYTeubJ=H!<st}>t7rp)?n9Lx7fvyoLau!l5Q
z1sI8*`AK_!pKT5(=XTa;4USlDQDeH?hgZMX@+N(Ln*x%&xAo$i=!N_Pq~AA><Tb?e
zNEREO$K5=SP}L94;|+!94}4|&{K-E=#aoGOb0)uq)aX8;K0${`>W`f!=-a~+1l~v!
zq-8$1udCf9=Lz6u`-8^hKTi$xl%uRk{4UWZ+jmwWw_M|+)0KaGKh^L3pOY!c1{U@3
zyA+G3z@zvM-TZSRc#AR-p}G2fic|1W=ipJQI6Mb`L&X2&qtcS!yPwkEb;tc+3-cw^
z@+J`QPBLjHk>BKi8n^(x3G%y2?F740M_nH7_dlXFHY`s-^?ijsNaZi1X8SZ$v-j&}
zImJG7=+NBB3uo*L=Pq4>p4PtjqCI~R(Pv3}I(O>gh57SmE}ucE_O2ahgz8-Tc$fN5
zPYW&_kDy~bE;uZ=>V3r^pb+t=J_?fGpikZid>|O~Fv`hxAq%^C9V_OBzsuVb7K+0)
zj&Cb)e0z=K3&j-i(n#kxT=^5*Du1H9jk+g)k2+3EgyHu4&l`@nG+{ip2g~0@>^iO9
zc|@#HU>uygac$h3NQ@`OjfsQ-Um>KZ;|6&Rk!nwS1B?;pb1yJchS8C6&ls8hZ^JU~
zWPqRyW8yBnR*3S@ks!uu;W>rJdlqeye-#^mp|raRjt^t6>Kj`Lcw53SL8jwCgV8Jr
z&l7#i3=C4<t-+)y6a#;odhGM@@j(T{5_a>F2Gd5OV}e2w$VL+ismVa;De_@?&Fom6
zR43gT>13c*OM-HWD03{$3PPBhfjRV5#rq=u$4I&xhfxkVW=eq~uM#9s4~Ahf?6JRa
zEik(&YfqyVp%(IU`Lal|nb~4|WRb!IDoBg+@lLWClFI#-NbOh-W=g1|(qx<#BcPE(
zUEnUH!KbB=nTs?R2QjD7u_NI{exCx&u+sk2rOOxRXS+s>*kyxuYtK*uy9ES#X#J&g
z1pmDQPFGITK0af&zwkVK7a$Zu<RV`!ueG$~;J;vy_5gMP<^^m8>A9BoeK&fNB6&!V
zXvX-V=M@5@%}d+*b+rmh3ORMu;m+YhvW1JufN=xb6=^&5CJB=8276$iAHPWM(mne{
zb&*;=0NJ!VGS5DKxP6diOLiq;t471MYi?r+nwtnMdA39UH^G}Jq}Z4-Ao?AmnSB!h
zIdznw>|w)2->IXJkK`#5i=rd!m0%huT8xZJVgdh$#$AVtS#PKeQ&?D{$tsTbkVZdk
zEH(<2XjV#uI*KN!l&YebiR8$rKyid|PxKM0rDCc_teAvTNC_<zbL(Z<fHYZ(9YTwf
z{QqrA+mg4j`7lNGdC@~+7bh@2(%h5FH<-Y^eII0CKY+U>($ebVc*vJTdxJzRa_1+p
zts67=f85Bw014bMFVJhhB)5^`WOIXL-)oTXXE<`4{GB5=>0dyB&{(Ru3sj6eWoF2i
zZvnnPGt~zEUk^LsMN35sRGdJMd;T;tu}2^~@X1Bw%ip0wFfBXefK*JtT-4l<3zS3u
z|A*0J1eA``=ff1(6cDFm-%0k)VJ8^2OR&txY>$UFFoC_xGPItlyP{QdPSD|G#~TZ1
iho@&C<O$QPackO|uy$D~{OwDptTC%#y=s**m;M(+;a0N%

literal 0
HcmV?d00001

diff --git a/utils/__pycache__/queries.cpython-310.pyc b/utils/__pycache__/queries.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b5e9ece0546b1ca84d2d20afbef247087b12f68
GIT binary patch
literal 10186
zcmeHN&2t+^cAsesJ_&*pN%2FXrO^i^LIO!q^4hZ3-mEOe&Q?~AvvR^1QHnJ{H-I4r
zGf>YADKdjfs<g=|s&(a>O%<7{9CF!zB-uY8lVftxHAh!vo!{#jfFLNVTyd&W*)quK
z>3-e)-s|qy-M{w$Z*sDv;q#CG88`m$9ZmapDvW+IDEt_Y|F=lE#&t)l>*~$w8TB^m
zhI(h~S@q7<bLyS1=heGVFW}9bVz*Q;>BJ-BOmxfjvdSCIWVceUsJ!k>)u(ut=YFo$
zr+J<i@IJ@S@d;l3xn7^)le_}TET7`jc+YYDiB_F?1--OSt9m@$lY(!zoNX!E+pg6W
zpk<$hUO<_AC>^gS+@?Xz(@zoa9G-`Gd`;K1D8uz#Ci9&24L65U2Bk@E^bMYE>-}t`
z^XzU$z8z^hTEuvc=i7`Icrns#otIu3{alpe6H6K|<6*moyqA_Isf;JTr1i7zUGybB
zE#ku~5&JuhPyKy{W6a=pj?cWz^mEW5&$Io)35N*1`E<hRNI!-KltRPSNt+WXd^XBP
z#*((D%bwy>c%mf~KD(qHF+Ru7AG4#(vC<53P+y%t{NRs#VLtkUpO}`*O-lxL(-CH1
z2afQ~z%!qPLLQh8rRV!nw9J+#O)CsKcsaHo#MzcD{Xk+Wwb*bhKX~;uKCu>Oz%tGy
zO<$?%4EOA+K`R7~an2WxXo7)bckLj~wQMI4GR}8}?^|tgI2m|><*Z5Jhfd(fSvwG2
z|1hr<d;jWJ`qenA@~_C6s~I_om#k1SMY?3$_MUKKef=xlB%%AB+yxiM+b1DX^c<@x
z;;i3u?4VkVSyRd~b)G{~FD7b#=J|2P=Fj7NQugH%)n=M*5F2D~|D?$)pnUa%4a}kR
zH~hf%y`7CmUNh_pH}E&O*xSH#TJA@7V<QY~$KRmQ+rnS(9mJEw!~a-%-9N$9(g6Hh
zND3^c7j#3<u>!lGPwR6!`^$p942q#&XLEWvgIsp`HwKf}(7Z{LJNVG%E#OfG{t+^P
z7GVo7V9D`hUa((O`g))ru}DAG`)o_&hM^hSi;NEAdZKM={)K&A$Ij5TFYpkpnmybr
znfTDshwiTH?Ym}MxLnAZiB)sVeLLuw4GU%@tw8vy@qrhDj!IYT32APaP0PI*m@;(D
zpkw>yYS6I)>Ml%_0?#uWcAM<Y?^se)6_3MdZUxpxB5Ruj92)DrT_<L;@o6;^8??XU
zT+eRqIs(?`Ssc`S$Lk7i%UEy46C-`qQYFb%OiVo07QyyNeGNpPY+x=|)=OU-nd$#B
zGV(SxEqrD<R$#kcoNqavw7J!!Q)KX2Lxwsglb*+^q8(w<F~M4(bA|=TEYUJxZ?@RH
z_5u?OdKQ$NqGOkj&<m;_`;-^KA%~VFqO>(W@eg`G9~e8?D9_6;^?sqP^^15)IIkvx
zT%^Cy`(<0}PvWTr`JH^kb_zj}(ib|Pe90bZUuOPntv^L4&`u$mTGDn(M-#_%V*Kf|
z$9!r?EvM8DpYBh)rC{=;hNBncOh?nvpjAIA^K&@o&bdZZj&NS}&kfs`cm?MdM&6$}
z%ttdj6_uae(%fQVW}r<av@f(_er~M)4Emo(|MO}8`H}wT)Bf}5Kgcfx(?{omnP_%r
zme10;_mUmW9YgP9{pfr!b2QKK7c3kt@&&w?b}k%e@TVVL<coYBXXHh$)4M;L@|pXN
zeCAR<^WTxre9C9xJMvkGW{;MSwWGIirzq<jly=#dxILn|K{>J7K`E(y_V`q9*zl?P
zU!3X<+df^3-tz;>Yh?c_S{SK?-3&@f_OHfzkIVk`sovwVe~sRA(QJ!FlY1J*`ESu=
zQigRcx>d^arC&1K`%7w<E%6KdBJSt%OV+;-T#8WQ7hYug7o&@O89VnPe@ms8RC;+y
zYa9K=XmP;x11(yJ=2K}*&Va7UkhTnI$bX}>w;=7UZy{|tS{`tPwArYFaWVNxREcoo
zKV|(({mX78xEy6rckV?dp+uLW%g_pK&}TmFW1&2+`k4JIX`lJD4=9ltT~U4JKhOep
zWJXim1kX7-w~)KUuV4-?@vF&67A?|_M!PG>W%#wf0unmLj%w$y$F72M9ee9)RM@$u
z=J~2R&lq3fH!#zMA^-VkA?0r>{^pSX@{s@X2>+X@9;!9<JSFeiP~NrE@}kScExJPK
zoYudNGM(3h^2)FtRMg)X)+gng(e-F0x)I%M8*0SE(y@L~dusHr_gDHi`ZuHE(Y2G7
z$2ifGS&GzH!1w0(d<|<>N9J>6yy#OM@1x46+uVvukbE;*jjGYDwn387D#KTCyWB<&
z<Bb+^=B?l?xXlf<zSpBG(ejX%NolCXyxgYQ!TQ~fa?$Mt4G4IGXYihQ_BdcWH=-gp
zc=aXYx8_DhuWE2ljnvfotNm*K)?iMeTB2c;YZU=HZXez(DG=~}&yrR*#hm{82~CP1
zlx|Xxi@16Z=M@eeKvO`974<JCP^r-A;dw%*XBxaB#Po_0i|u`3a*t3-(+k}II7RrP
zW;Q|)I-<Et1e072yBXM?>nG?1kVUwDD1`~gBTWVAOm1`2^#ap(>>j#FFKl<NRM{(n
zX7UaR`=R4`J=+E3NotecfPq57XAypFC<$#6l%fE-1Z=J5;)#cL07T-szb_8zxd#f6
z*K?mJR2J)7vHqZ*r_w_!AYZ8IIbqlJ$v;xCS`uiEjRRpx`#%6!?0-fg@c=kme$%$&
zoZWUk05Ltz2`N44OVa(2Jza~7pGmk_%|P%f1EQu8!wsO9`~Yp_50TUhZfASH1JGUv
zL)&ue1)#p3<=RbE)d2k7J<P87wG}_kCF6?=R<jvmfCrzxB6#(ARj-#5HMVWQ&F6>4
ziofDlncN^TG>uTtu%6rgD^7)*av85U+mkjBI&*t*_E5laiCK@w#Ye!BAH$SHg6WLl
z8Sln<z;uG+amE3Z%=duev9htTD-NJn%N7nN9{}tolz}NX(~nDJOKE-*Cef1~*38Fx
z#*0nQ1>y~U3Ml;iNl!Fw%XtV)3pVhRFq>}E<08&+q29$nTCNXtDdJ+++7+q@fGf~E
zY`{x`df+<I3*BZAswI~fY4{h^P^r2n$$aBU%;L7yv$qrfDr-q;9T1mp9sUkrbq`Xl
z9w0N1vtFp2tri3a7`V}ek5XNemx=Q#l6t;jHwc}_rDSooF(Y-O<zXgrT><}($5YMJ
z2DiHw)Oar6rjCUk1j%jQk{?phAcK{ch%fmz^-3#r95r*>1n%ooC)1h`vZ}vCWI~Hs
zyAiXNRWG!xZDn98mslKgKzD|8<P^Q0M_ZB^XVq4$XHn*@dd5SNM=PqUXOTg<h`MBW
z7#gzmBIpAGWfeE2Cv~Mv%(~r}?c?e0<Bi9nH|UmCFDGM9rXyxsTjM@Qg?tVbwcw)b
z`S&qM|0^V=+@ih+r>_K;ufWQ>!QkQ{NAG2F0P*DXIaX#Ry};%e(<>;$O#$sBznodr
z+1Jx}mf4$fmyAo`c!`at6;{#9YzBOn@tiJ|P&W<D25*C1*JmJ!eO+MeH;WlVzph^$
zDNnO$xUSRs4Sfc=3;08OR)YJAKa8xTUx0Hv&!oign{)^dK6K7d%tm=re}fFyBaIwb
zxU%qY=8^Ab0zJyW3o#<&hylkUZiCw_8yH8~C`(=w;v=}L5OZO2l~ai#DTRw#8!?u&
zu86vKPHK2=q^^Xzj}Wtg&qdgU`b7B_Im6{o4=;Cupy$88kvgVp*mLVmuZv>vJJ=;3
z9u{tp>3z83A7;a#wRS(*3AvW&T8>bck^B+H)y9L*7eB5WRDp<9;#{lat(ucGDUxO)
zHl(n){9I*sa6nNU!{UCmAe+?2q@+c)83euJ0(N^3rx-XUhYVK=#Cm(o9OYB=`4>F?
zJd!!sUj-Ib#;PSx85Gr~Noz9r(A7a@wLV`W_`u<bMg@4hEYJyA5UM|BFLYppgl?pC
zeT<$>>1>P+pI7mrRUPG%PwB=Oy^zwgWAtK5&yCSbDLsEe=M()R5Zdd;kkHSLA)%ig
zLqb10hJ=203<>@07!vx~F(mY}V<<+2sF>QQ_1Cz|e<qr^-AsynnYL^C6o|1LpccB+
zOs{3)q9?3jb}ZktIojH&Lktk17^1l5;D84fv4DAS#hHHC>;SVMcB9TZfFOjgAOrC(
zg-<+yBx!aLE0K0^01+L}!R2St2Z0>~kPtUo17ir1f}$#Wh&Fl(Te)GkfzG5ENl67j
z4+36FZF6;V?Y-){`FlV~Cg7S+zzRssgN7cdxF?(gz&F>&cqCw(=h3AMNdsJ!=#DZ4
z9~EQLjH9FUNnr>-k)}i|5r}kovP-pKj>`_95a6pejDyjwQZsQc0^op6ZDcN5&<pr!
z-Tas$r8QF9YXp|<Vg^PKB}&Q7bD{#8lX8mJD0qby!oma%M`bn+%uo%Kt|$tWA*4oY
z<U=y1SK(H#VKr<-JJlF1%qOv@gUW0OnxiHF11&#zS+t;*Yp(vFW^UfWe1CKRCIHod
z2@2|xR;v^*7SEeb$f-NRxh+~(1yNMwIGIP1LM=xIpp1{eGSD8-ZfZ)^nn|?bpnNse
zm?v|8e<e|9RViKFP};2$M|7gk<E<4F3MUmcbL(;1VFScuEgpkJE%2<HpC@A>6hqNR
z6@kXc)pBFjtZD+|34?nsd;y?w8iMZ;n8PG++oCGMM?NN=7SdBz7sznbs)mcvwGPN1
zsF4WThOkjA5_}6pKVA3}$u_{crY4Z=Ml2PHZB$JgE{NrlM*)beoD&Mh0+zyR#q%0?
zyK4>GwInPL6$(VAta`%hIbvN+(-UA$m!CGl)S<x`V8a;RMq<Q}7l^h8O@}+;H7iM#
zXHD_j8zE_x7~;@GzQH1Kx+LE^Kq9;u403gI6V`e6HP-n?%lu|Dd+=aoL<p&t3QDbR
zOo4&@ZK0d;bkfcnW9@3@16b!6abvVcVqIHkHA+3z<h0F_Zf~$#Wi=lsfok)Q;j2mW
z5loSclw6F_Q%Cu+sdb*U#?uxtFv>(-Sfx_qwU+r47&YwA3w@<a*M2Ve8l92pJYYkS
znRYyUXRzNPwGK?2lkmUomO7~e+GunQVT18-ctcoSd`l1#+e9_{P#rhKEIoS%1a*wt
zsXv}rzqut$c-^~nl<!+oZQ53dKmv52J5rsP*j)}cJHqK<W7CW^1Uw}A;sY0oACQ@L
zynU+h!(PvmfhnGaIQ1I%BID$i#5PYa*Y!Gk{8%BnI@@}T!j|j;JjrKB>g>T5s1NXd
zVEq!9>bLR6eoA-!pW`b&#84dP?w!qh$-yLejl=ptxIZK`EqpB}L3;ODXaSDUWwM83
z67bk|ZEN?Wcg=i6Yi6VW-8<`dHrMamTfhI_+xOnRPlu`uJOFJEb(@Ku;K;MwHkF66
z01n9al{-S5&~eR&&YrD$+T8}e^#MNW|F7eX@8QO4;YMOA-?*@c9WR6r?ucz#^zD1^
zynA<Z^A1+${~Db<#lcN@=?(A%O$dFcW`s`$T}A;)+U9$N^4o$?5_`s-C3fjlyYn7l
zn;KxG$LZJ`gp%F%Yi#&`jA39!Cs>1S=x+&P$Ra6T`vw$qinU^U3Zo2Q%s8h}w%_9z
zg}(kD;+U$D#GQXe=!IfG^_+$8HC$dT$sMBYQsPiD1US;AqDM&&Nn8Yg!q0f%^2L>5
z_NNpmAB#-UPrKmVY!4gx|BC+p4J3xXSfOC)$)CZ-DFr6`Xo_zNYIsTdeMa>>0>WXp
zcc65~k4AvxEMNXE)e~F!2_>IVvPH=kNaA7wDm_OgA@(xDYmVJm$5&GLy-$XkR8o=z
z3>1jQQ@l=wlw6~Pem<f?oGOl{v>F*I{dh>KAEOjL=z2VK#E<C5JU)>MS^?p=!Cx66
TISR)iJbbxS*8ZY!zC8KgkCfP$

literal 0
HcmV?d00001

diff --git a/utils/queries.py b/utils/queries.py
index 6985371..a64c4e8 100644
--- a/utils/queries.py
+++ b/utils/queries.py
@@ -1,3 +1,15 @@
+import numpy as np
+import pandas as pd
+import pickle
+import tqdm as tqdm
+import preprocessing
+import re
+import string
+from ethnicolr import pred_fl_reg_name
+from urllib.parse import quote
+from urllib.request import urlopen
+import json
+
 def namesFromXref(cr, doi, title, authorPos):
     '''Use DOI and article titles to query Crossref for author list'''
     if authorPos == 'first':
@@ -29,35 +41,19 @@ def namesFromXref(cr, doi, title, authorPos):
     return name
 
 
-def gender_base(homedir):
-	"""
-	for unknown gender, fill with base rates
-	you will never / can't run this (that file is too big to share)
-	"""
-	main_df = pd.read_csv('/%s/data/NewArticleData2019.csv'%(homedir),header=0)
-
-
-	gender_base = {}
-	for year in np.unique(main_df.PY.values):
-		ydf = main_df[main_df.PY==year].AG
-		fa = np.array([x[0] for x in ydf.values])
-		la = np.array([x[1] for x in ydf.values])
-
-		fa_m = len(fa[fa=='M'])/ len(fa[fa!='U'])
-		fa_w = len(fa[fa=='W'])/ len(fa[fa!='U'])
-
-		la_m = len(la[fa=='M'])/ len(la[la!='U'])
-		la_w = len(la[fa=='W'])/ len(la[la!='U'])
-
-		gender_base[year] = [fa_m,fa_w,la_m,la_w]
+def get_gender_base(homedir):
+    """
+    for unknown gender, fill with base rates
+    you will never / can't run this (that file is too big to share)
+    """
 
-	gender_base[2020] = [fa_m,fa_w,la_m,la_w]
+    with open(homedir + 'data/gender_base' + '.pkl', 'rb') as f:
+        gender_base = pickle.load(f)
 
-	with open(homedir + '/data/gender_base' + '.pkl', 'wb') as f:
-		pickle.dump(gender_base, f, pickle.HIGHEST_PROTOCOL)
+    return gender_base
 
 
-def get_pred_demos(authors):
+def get_pred_demos(authors, homedir, bibfile, gender_key, font='Palatino', method='florida'):
     """
 
     :param authors:
@@ -79,6 +75,11 @@ def get_pred_demos(authors):
     race = []
 
     idx = 0
+    # save base gender rates
+    gender_base = get_gender_base(homedir)
+    # make a dictionary of names so we don't query the same thing twice
+    full_name_data = {}
+    first_name_data = {}
     for paper in tqdm.tqdm(bibfile.entries, total=len(bibfile.entries)):
         if 'author' not in bibfile.entries[paper].persons.keys():
             continue  # some editorials have no authors
@@ -106,6 +107,17 @@ def get_pred_demos(authors):
             la_fname = la.last_names[0]  # for people like Plato
         la_lname = la.last_names[0]
 
+
+        fa_fname = preprocessing.convertLatexSpecialChars(str(fa_fname.encode("ascii", errors="ignore").decode())).translate(
+            str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '')
+        fa_lname = preprocessing.convertLatexSpecialChars(str(fa_lname.encode("ascii", errors="ignore").decode())).translate(
+            str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '')
+        la_fname = preprocessing.convertLatexSpecialChars(str(la_fname.encode("ascii", errors="ignore").decode())).translate(
+            str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '')
+        la_lname = preprocessing.convertLatexSpecialChars(str(la_lname.encode("ascii", errors="ignore").decode())).translate(
+            str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '')
+
+        # double check for self cites again
         if fa_fname.lower().strip() == authors[1].lower().strip():
             if fa_lname.lower().strip() == authors[0].lower().strip():
                 continue
@@ -122,48 +134,35 @@ def get_pred_demos(authors):
             if la_lname.lower().strip() == authors[2].lower().strip():
                 continue
 
-        fa_fname = convertLatexSpecialChars(str(fa_fname.encode("ascii", errors="ignore").decode())).translate(
-            str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '')
-        fa_lname = convertLatexSpecialChars(str(fa_lname.encode("ascii", errors="ignore").decode())).translate(
-            str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '')
-        la_fname = convertLatexSpecialChars(str(la_fname.encode("ascii", errors="ignore").decode())).translate(
-            str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '')
-        la_lname = convertLatexSpecialChars(str(la_lname.encode("ascii", errors="ignore").decode())).translate(
-            str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '')
+        if (fa_lname, fa_fname) in full_name_data:
+            fa_race = full_name_data[(fa_lname, fa_fname)]
+        else:
+            names = [{'lname': fa_lname, 'fname': fa_fname}]
+            fa_df = pd.DataFrame(names, columns=['fname', 'lname'])
+            odf = pred_fl_reg_name(fa_df, 'lname', 'fname')
+            fa_race = [odf['nh_white'], odf['asian'], odf['hispanic'], odf['nh_black']]
+            full_name_data[(fa_lname, fa_fname)] = fa_race
+
+        if (la_lname, la_fname) in full_name_data:
+            la_race = full_name_data[(la_lname, la_fname)]
+        else:
+            names = [{'lname': la_lname, 'fname': la_fname}]
+            la_df = pd.DataFrame(names, columns=['fname', 'lname'])
+            odf = pred_fl_reg_name(la_df, 'lname', 'fname')
+            la_race = [odf['nh_white'], odf['asian'], odf['hispanic'], odf['nh_black']]
+            full_name_data[(la_lname, la_fname)] = la_race
+
+        if fa_fname in first_name_data:
+            fa_gender, fa_g = first_name_data[fa_fname]
+        else:
+            fa_gender, fa_g = gen_api_query(gender_key, fa_fname, gb)
+            first_name_data[fa_fname] = (fa_gender, fa_g)
 
-        names = [{'lname': fa_lname, 'fname': fa_fname}]
-        fa_df = pd.DataFrame(names, columns=['fname', 'lname'])
-        asian, hispanic, black, white = pred_fl_reg_name(fa_df, 'lname', 'fname').values[0][-4:]
-        fa_race = [white, asian, hispanic, black]
-
-        names = [{'lname': la_lname, 'fname': la_fname}]
-        la_df = pd.DataFrame(names, columns=['fname', 'lname'])
-        asian, hispanic, black, white = pred_fl_reg_name(la_df, 'lname', 'fname').values[0][-4:]
-        la_race = [white, asian, hispanic, black]
-
-        url = "https://gender-api.com/get?key=" + gender_key + "&name=%s" % (quote(fa_fname))
-        response = urlopen(url)
-        decoded = response.read().decode('utf-8')
-        fa_gender = json.loads(decoded)
-        if fa_gender['gender'] == 'female':
-            fa_g = [0, fa_gender['accuracy'] / 100.]
-        if fa_gender['gender'] == 'male':
-            fa_g = [fa_gender['accuracy'] / 100., 0]
-        if fa_gender['gender'] == 'unknown':
-            fa_g = gb[:2]
-
-        url = "https://gender-api.com/get?key=" + gender_key + "&name=%s" % (quote(la_fname))
-        response = urlopen(url)
-        decoded = response.read().decode('utf-8')
-        la_gender = json.loads(decoded)
-        if la_gender['gender'] == 'female':
-            la_g = [0, la_gender['accuracy'] / 100.]
-
-        if la_gender['gender'] == 'male':
-            la_g = [la_gender['accuracy'] / 100., 0]
-
-        if la_gender['gender'] == 'unknown':
-            la_g = gb[2:]
+        if la_fname in first_name_data:
+            la_gender, la_g = first_name_data[la_fname]
+        else:
+            la_gender, la_g = gen_api_query(gender_key, la_fname, gb)
+            first_name_data[la_fname] = (la_gender, la_g)
 
         fa_data = np.array(
             [paper, '%s,%s' % (fa_fname, fa_lname), '%s,%s' % (fa_gender['gender'], fa_gender['accuracy']), fa_race[0],
@@ -181,7 +180,6 @@ def get_pred_demos(authors):
         mm, wm, mw, ww = [mm, wm, mw, ww] / np.sum([mm, wm, mw, ww])
 
         gender.append([mm, wm, mw, ww])
-
         ww = fa_race[0] * la_race[0]
         aw = np.sum(fa_race[1:]) * la_race[0]
         wa = fa_race[0] * np.sum(la_race[1:])
@@ -201,9 +199,22 @@ def get_pred_demos(authors):
     mm, wm, mw, ww = np.mean(gender, axis=0) * 100
     WW, aw, wa, aa = np.mean(race, axis=0) * 100
 
-    return mm, wm, mw, ww, WW, aw, wa,aa
-
-def print_statements(mm, wm, mw, ww, WW, aw, wa,aa):
+    return mm, wm, mw, ww, WW, aw, wa, aa, citation_matrix
+
+def gen_api_query(gender_key, name, gb):
+    url = "https://gender-api.com/get?key=" + gender_key + "&name=%s" % (quote(name))
+    response = urlopen(url)
+    decoded = response.read().decode('utf-8')
+    gender = json.loads(decoded)
+    if gender['gender'] == 'female':
+        g = [0, gender['accuracy'] / 100.]
+    if gender['gender'] == 'male':
+        g = [gender['accuracy'] / 100., 0]
+    if gender['gender'] == 'unknown':
+        g = gb[:2]
+    return gender, g
+
+def print_statements(mm, wm, mw, ww, WW, aw, wa, aa):
     statement = "Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars \
     are under-cited relative to the number of such papers in the field (1-9). Here we sought to proactively consider choosing references that reflect the \
     diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. First, we obtained the predicted gender of the first \

From 0cc0df3dc0ed05f32ac0855a8d43ecbad044b396 Mon Sep 17 00:00:00 2001
From: Stiso <jeni.stiso@gmail.com>
Date: Fri, 5 Aug 2022 11:02:14 -0400
Subject: [PATCH 23/47] added check for names that have already been queried

---
 utils/__pycache__/queries.cpython-310.pyc | Bin 10186 -> 10461 bytes
 utils/queries.py                          |  10 ++++++++++
 2 files changed, 10 insertions(+)

diff --git a/utils/__pycache__/queries.cpython-310.pyc b/utils/__pycache__/queries.cpython-310.pyc
index 1b5e9ece0546b1ca84d2d20afbef247087b12f68..a4cf3c74c2e888a7aca0e059ba566383971bfefb 100644
GIT binary patch
delta 1849
zcmb7E&u`;I6rLHo`QbWF;^v2)q^X-UY11|>$33jXf!!No6>&hIMOEpfApz2D>a9fe
zMiDA2_OjASJ^~`<fH;BJUicf3I4nmbBrb>(9QH5ZjpI0Nw0j_yC-1%Q``&x=#`Z7w
z{^%Gdsgy?WSAOlE55B$ilW}*+*hg2#`whI$e)A5Oz8G^=r18$U@j)EeAmJwEdP#Ze
z3A;m{D(`nhmK!n#S(^;B$+Az*>FD6J!_!ez52GLRj4+3qo0=m$ZIeqXOfX@_`V4fA
zxx~s)g|QB%G$Vn=4V!QkW4Vhs<OW7y=ied2Bqj2~7&eS}{@uA<SkTLyMbsBVy>(4r
z4D}Z3i<!`|-OMEGnvnPYjlE9u-g*2UwWOW2Ld2Hdferad#V3<hp8(;CO=jLl>?s|&
zP?-6MVv{p(ik^0EsACJb2L<oXMETG|o;6dxq{=g}uc3=oo-L>st2_r=RvAA>UIFsQ
z3{vL=d;bekQy)iHa_Q8*q?8QX(&qM8tVc!JRR>E@<|SV4F|I#hqT*Je!ZC8Gh*ijN
zrAR~uR(ZuHxUL3kHCWed(o;nVO0v>emcC8E1}kiVjhYAqKDJQB76LnL0qs||u!b$H
z{f`!^P=#?=3R}p7ffHb(BQPLullx3KVjaupgVLtE1`2XkE|n<)>0pm(a9|x<M-Aq%
z!W@s#w}KosQO6wCwH#KM0})79!z4AR2T82k#AOo)vfRO91@8x~xLlHt5JuOb<UP>r
z@dhLY_22+DATEc`8@!1}kcgBP*vG6uFh`BA<IrooW)m5KYLp8r^ZFxZ6QA-;a6%rk
zC9fCr>7<5?4WEfdRIefHjpqenU8aM&TysP;(Z_co@;4(Kqlj-s_|$Jf6E<NBT0J$`
zUgY_-Ozx|qDK^EHXhCvPpELb2K8z5|WV#zn+?q~jzM}})o-zac+Ky4Pv#5CqXBRXq
z-GXh{f!&_!V`Z0(Cko&4sc55M1MpCMHfi8@pv~3b@HU_h)hMQfF+_1PZ8;wtX&YkD
z#y2De%fSP%eA;m-Z$bJ?!ndPuZ#+lP-}!n;K0S!<LI(Y#R}t8t2}!Q<HXhJk?%Dud
ziD);FJ+Uoz#O`dYus7W!#Ez41lh*##_j-f9d9l)Y*d6t|-05|PygPCZAM~9I!*Tnc
zbkCfVhpuyS>|7+A?$AZn*+qFq9~~Zb_q*<g!~Rj<{WOqW{c5~L>AqLVd|pir4=~}u
zX^{NvJ!5zv1vB6KCi9|ZQbM$tMJ<}38cX1%Q<bTBCs^=Ly&to~TRJtU&P=K!t%9q8
sw3`tPBO1-H1fp}>yKCN29L2eMY(A#+_p5B~JCpu>^>Z^#mAan#4~h`UrT_o{

delta 1418
zcma)5OOMk?5bka}lf>gVj_o+{B$LPFJz+Q^4rq~fS#cUh2qd)Hxqw;|6rt?QY%~W{
ztsDl#iHGDP2*n4G_JUaDPpo#~2P}yH0G!aCw%l<%86+fZb-U`TufFQ4PCi`yd&l@K
zlhFu1FFwD!b@kkz#?J|33tb)8Pq=>n(Z88^;D1bblP!_u2C!iYvK|>~)75|+(ZgR4
zws|&2^$5MqO<|6-apok#G3%HL6HHjJ1n4~H5s@3Ipo1A&F<;|`N4ScZbvFst(3nWa
zzClK5O2QYx>>T^$#CE<Cuuve60{NjKx$aVaKFW77e_<EK<}^39z#iJ%<_^z4WK$==
zg9rXj@@(3U!jAt#@*;KobIBcA2yLYRwr{3N%04*$*;Fa9f3S_n6+|8*atD#eh`5sT
z3JQ|*N+KcWEh5O7yhX$Tdyhfph&-W(?;$hyv5n%W7uwu=#s*Y)V|ADSkFW94fN}jH
z6KmrNpvTva6;Xl`FM9;{b-7pMUiHX86-6j6R4))HfE!g~ht>oG62Nk0EQj_t%dKO%
z_3tQGhVnuc%h_OHW9;xU7+`y3mx+p~j*M{?6pT%B$I2Xn3RLl2NWvsHO1g=@E0b!X
z9woUj&p`xgP?tIG1u|yS8f1A5MV&q760|D3j#H@c4UdEf6vH0`^6F?Qym6m-#Q#gn
zwKgC%Y{=<uKr);xZ}B$HHx+BTcw5i4!lWE4lotvC<(;TfEvy$-8mt6#x`B*lz(gZf
zZ=6<#YN!VFaCeAkqOUCZtvHS-#@kSxM1(oefhM$|4V{51n~J@FmdLIunxZAzq66t^
z<AfPd{2$3dfNTbZJEuny&^Xdv8d=!zM9wUrnAH0^N%~7uLCz*<Sh@pU=)vYd4ND_S
z;oX3WK3do@6azMG1qAwBm3P>LI+SBfi7<@fWcqty|2Wb<tU$j&#0so(1;48Y+haCt
zLz=6+hu631EcGuV+NsDH(G@*BUNkW{Ge07%?4_@fUVnBz(_*tfvR5eG^1qn({HM83
z>_6f^vMT;->rU68I<u%w%eV}->}lwkOsDg&f8D+`duBhN^v$f9e`(QwXa97vRLSd^
F{{hQ$R$%}D

diff --git a/utils/queries.py b/utils/queries.py
index a64c4e8..4f7ff41 100644
--- a/utils/queries.py
+++ b/utils/queries.py
@@ -80,6 +80,8 @@ def get_pred_demos(authors, homedir, bibfile, gender_key, font='Palatino', metho
     # make a dictionary of names so we don't query the same thing twice
     full_name_data = {}
     first_name_data = {}
+    n_gen_queries = 0
+    n_race_queries = 0
     for paper in tqdm.tqdm(bibfile.entries, total=len(bibfile.entries)):
         if 'author' not in bibfile.entries[paper].persons.keys():
             continue  # some editorials have no authors
@@ -140,6 +142,7 @@ def get_pred_demos(authors, homedir, bibfile, gender_key, font='Palatino', metho
             names = [{'lname': fa_lname, 'fname': fa_fname}]
             fa_df = pd.DataFrame(names, columns=['fname', 'lname'])
             odf = pred_fl_reg_name(fa_df, 'lname', 'fname')
+            n_race_queries = n_race_queries + 1
             fa_race = [odf['nh_white'], odf['asian'], odf['hispanic'], odf['nh_black']]
             full_name_data[(fa_lname, fa_fname)] = fa_race
 
@@ -149,6 +152,7 @@ def get_pred_demos(authors, homedir, bibfile, gender_key, font='Palatino', metho
             names = [{'lname': la_lname, 'fname': la_fname}]
             la_df = pd.DataFrame(names, columns=['fname', 'lname'])
             odf = pred_fl_reg_name(la_df, 'lname', 'fname')
+            n_race_queries = n_race_queries + 1
             la_race = [odf['nh_white'], odf['asian'], odf['hispanic'], odf['nh_black']]
             full_name_data[(la_lname, la_fname)] = la_race
 
@@ -156,12 +160,14 @@ def get_pred_demos(authors, homedir, bibfile, gender_key, font='Palatino', metho
             fa_gender, fa_g = first_name_data[fa_fname]
         else:
             fa_gender, fa_g = gen_api_query(gender_key, fa_fname, gb)
+            n_gen_queries = n_gen_queries + 1
             first_name_data[fa_fname] = (fa_gender, fa_g)
 
         if la_fname in first_name_data:
             la_gender, la_g = first_name_data[la_fname]
         else:
             la_gender, la_g = gen_api_query(gender_key, la_fname, gb)
+            n_gen_queries= n_gen_queries + 1
             first_name_data[la_fname] = (la_gender, la_g)
 
         fa_data = np.array(
@@ -196,6 +202,10 @@ def get_pred_demos(authors, homedir, bibfile, gender_key, font='Palatino', metho
         citation_matrix = citation_matrix + paper_matrix
         idx = idx + 1
 
+    # report queries
+    print(f"Queried gender api {n_gen_queries} times out of {len(bibfile.entries)*2} entries")
+    print(f"Queried race/ethnicity api {n_race_queries} times out of {len(bibfile.entries)*2} entries")
+
     mm, wm, mw, ww = np.mean(gender, axis=0) * 100
     WW, aw, wa, aa = np.mean(race, axis=0) * 100
 

From 406665f3664f83dacbb7acbbb6d27b1bcf1d07d4 Mon Sep 17 00:00:00 2001
From: Stiso <jeni.stiso@gmail.com>
Date: Fri, 5 Aug 2022 12:03:21 -0400
Subject: [PATCH 24/47] made histogram plotting in python

---
 cleanBib.ipynb                              | 103 +++++++-------------
 tests/aux/pipeline.py                       |  14 ++-
 tests/erroneous/pipeline.py                 |  14 ++-
 tests/erroneous/testBib_erroneous_clean.bib |   1 -
 utils/__pycache__/queries.cpython-310.pyc   | Bin 10461 -> 10413 bytes
 5 files changed, 61 insertions(+), 71 deletions(-)

diff --git a/cleanBib.ipynb b/cleanBib.ipynb
index bcafb12..3df6c48 100644
--- a/cleanBib.ipynb
+++ b/cleanBib.ipynb
@@ -186,7 +186,6 @@
    "source": [
     "genderAPI_key = '&key=YOUR ACCOUNT KEY HERE'\n",
     "\n",
-    "# TODO: Remove in the PR that gets rid of argparse. \n",
     "# The following saves the api key to a txt file just to be reloaded by the next cell\n",
     "with open(\"genderAPIkey.txt\", 'w') as f:\n",
     "    f.write(genderAPI_key)\n",
@@ -223,14 +222,13 @@
    },
    "outputs": [],
    "source": [
-    "from ethnicolr import pred_fl_reg_name\n",
     "f = open(\"genderAPIkey.txt\", \"r\")\n",
     "genderAPI_key = f.readline().replace('\\n', '')\n",
     "\n",
     "import tensorflow as tf\n",
     "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'\n",
     "\n",
-    "mm, wm, mw, ww, WW, aw, wa, aa, citation_matrix = get_pred_demos((yourFirstAuthor+' '+yourLastAuthor).replace(',',''), homedir, bib_data, gender_key)\n",
+    "mm, wm, mw, ww, WW, aw, wa, aa, citation_matrix = get_pred_demos((yourFirstAuthor+' '+yourLastAuthor).replace(',',''), homedir, bib_data, genderAPI_key)\n",
     "statement, statementLatex = print_statements(mm, wm, mw, ww, WW, aw, wa, aa)"
    ]
   },
@@ -312,74 +310,43 @@
    "outputs": [],
    "source": [
     "# Plot a histogram #\n",
-    "names <- read.csv('/home/jovyan/predictions.csv', header=T)\n",
-    "total_citations <- nrow(na.omit(names))/2\n",
-    "names$GendCat <- gsub(\"female\", \"W\", names$GendCat, fixed=T)\n",
-    "names$GendCat <- gsub(\"male\", \"M\", names$GendCat, fixed=T)\n",
-    "names$GendCat <- gsub(\"unknown\", \"U\", names$GendCat, fixed=T)\n",
-    "gend_cats <- unique(names$GendCat)  # get a vector of all the gender categories in your paper\n",
-    "\n",
-    "# Create an empty data frame that will be used to plot the histogram. This will have the gender category (e.g., WW, MM) in the first column and the percentage (e.g., number of WW citations divided by total number of citations * 100) in the second column #\n",
-    "dat_for_plot <- data.frame(gender_category = NA,\n",
-    "                           number = NA,\n",
-    "                           percentage = NA)\n",
-    "\n",
-    "\n",
-    "### Loop through each gender category from your paper, calculate the citation percentage of each gender category, and save the gender category and its citation percentage in dat_for_plot data frame ###\n",
-    "if (length(names$GendCat) != 1) {\n",
-    "  \n",
-    "  for (i in 1:length(gend_cats)){\n",
-    "    \n",
-    "    # Create an empty temporary data frame that will be binded to the dat_for_plot data frame\n",
-    "    temp_df <- data.frame(gender_category = NA,\n",
-    "                          number = NA,\n",
-    "                          percentage = NA)\n",
-    "    \n",
-    "    # Get the gender category, the number of citations with that category, and calculate the percentage of citations with that category\n",
-    "    gend_cat <- gend_cats[i]\n",
-    "    number_gend_cat <- length(names$GendCat[names$GendCat == gend_cat])\n",
-    "    perc_gend_cat <- (number_gend_cat / total_citations) * 100\n",
-    "    \n",
-    "    # Bind this information to the original data frame\n",
-    "    temp_df$gender_category <- gend_cat\n",
-    "    temp_df$number <- number_gend_cat\n",
-    "    temp_df$percentage <- perc_gend_cat\n",
-    "    dat_for_plot <- rbind(dat_for_plot, temp_df)\n",
-    "    \n",
-    "  }\n",
-    "  \n",
-    "}\n",
-    "\n",
+    "names = pd.read_csv('/home/jovyan/predictions.csv')\n",
+    "total_citations = names.CitationKey.nunique()\n",
+    "names.GendCat = names.GendCat.str.replace('female', 'W', regex=False)\n",
+    "names.GendCat = names.GendCat.str.replace('male', 'M', regex=False)\n",
+    "names.GendCat = names.GendCat.str.replace('unknown', 'U', regex=False)\n",
+    "gend_cats = names['GendCat'].dropna().unique()  # get a vector of all the gender categories in your paper\n",
+    "\n",
+    "# Create a data frame that will be used to plot the histogram. This will have the gender category (e.g., WW, MM) in the first column and the percentage (e.g., number of WW citations divided by total number of citations * 100) in the second column #\n",
+    "#dat_for_plot =\n",
+    "dat_for_plot = names.groupby('GendCat').size().reset_index()\n",
+    "dat_for_plot.rename(columns={0:'count'}, inplace=True)\n",
+    "dat_for_plot = dat_for_plot.assign(percentage=dat_for_plot['count']/total_citations*100)\n",
     "\n",
     "# Create a data frame with only the WW, MW, WM, MM categories and their base rates - to plot percent citations relative to benchmarks\n",
-    "dat_for_baserate_plot <- subset(dat_for_plot, gender_category == 'WW' | gender_category == 'MW' | gender_category == 'WM' | gender_category == 'MM')\n",
-    "baserate <- c(6.7, 9.4, 25.5, 58.4)\n",
-    "dat_for_baserate_plot$baserate <- baserate[order(c(which(dat_for_baserate_plot$gender_category == 'WW'), which(dat_for_baserate_plot$gender_category == 'MW'), which(dat_for_baserate_plot$gender_category == 'WM'), which(dat_for_baserate_plot$gender_category == 'MM')))]\n",
-    "dat_for_baserate_plot$citation_rel_to_baserate <- dat_for_baserate_plot$percentage - dat_for_baserate_plot$baserate\n",
-    "\n",
-    "\n",
-    "# Plot the Histogram of Number of Papers per category against predicted gender category #\n",
-    "\n",
-    "library(ggplot2)\n",
-    "\n",
-    "dat_for_plot = dat_for_plot[-1:-2,]\n",
-    "\n",
-    "dat_for_plot$gender_category <- factor(dat_for_plot$gender_category, levels = dat_for_plot$gender_category)\n",
-    "ggplot(dat_for_plot, aes(x = gender_category, y = number, fill = gender_category)) +\n",
-    "  geom_bar(stat = 'identity', width = 0.75, na.rm = TRUE, show.legend = TRUE) + \n",
-    "  scale_x_discrete(limits = c('WW', 'MW', 'WM', 'MM', 'UW', 'UM', 'WU', 'MU', 'UU')) +\n",
-    "  geom_text(aes(label = number), vjust = -0.3, color = 'black', size = 2.5) +\n",
-    "  theme(legend.position = 'right') + theme_minimal() +\n",
-    "  xlab('Predicted gender category') + ylab('Number of papers') + ggtitle(\"\") + theme_classic(base_size=15)\n",
-    "\n",
+    "dat_for_baserate_plot = dat_for_plot.loc[(dat_for_plot.GendCat == 'WW') |\n",
+    "                                         (dat_for_plot.GendCat == 'MW') |\n",
+    "                                         (dat_for_plot.GendCat == 'WM') |\n",
+    "                                         (dat_for_plot.GendCat == 'MM'),:]\n",
+    "# MM,MW,WM,WW\n",
+    "baserate = [58.4, 9.4, 25.5, 6.7]\n",
+    "dat_for_baserate_plot['baserate'] = baserate\n",
+    "dat_for_baserate_plot = dat_for_baserate_plot.assign(citation_rel_to_baserate=\n",
+    "                                                     dat_for_baserate_plot.percentage - dat_for_baserate_plot.baserate\n",
+    "                                                     )\n",
+    "\n",
+    "# plot\n",
+    "plt.figure()\n",
+    "sns.barplot(data=dat_for_plot, x='GendCat', y='count', order=np.flip(gend_cats))\n",
+    "plt.xlabel('Predicted gender category')\n",
+    "plt.ylabel('Number of papers')\n",
+    "plt.tight_layout()\n",
     "\n",
-    "# Plot the Histogram of % citations relative to benchmarks against predicted gender category\n",
-    "ggplot(dat_for_baserate_plot, aes(x = gender_category, y = citation_rel_to_baserate, fill = gender_category)) +\n",
-    "  geom_bar(stat = 'identity', width = 0.75, na.rm = TRUE, show.legend = TRUE) +\n",
-    "  scale_x_discrete(limits = c('WW', 'MW', 'WM', 'MM')) +\n",
-    "  geom_text(aes(label = round(citation_rel_to_baserate, digits = 2)), vjust = -0.3, color = 'black', size = 2.5) +\n",
-    "  theme(legend.position = 'right') + theme_minimal() +\n",
-    "  xlab('Predicted gender category') + ylab('% of citations relative to benchmarks') + ggtitle(\"\") + theme_classic(base_size=15)"
+    "plt.figure()\n",
+    "sns.barplot(data=dat_for_baserate_plot, x='GendCat', y='citation_rel_to_baserate', order=['WW','WM','MW','MM'])\n",
+    "plt.xlabel('Predicted gender category')\n",
+    "plt.ylabel('% of citations relative to benchmarks')\n",
+    "plt.tight_layout()"
    ]
   },
   {
diff --git a/tests/aux/pipeline.py b/tests/aux/pipeline.py
index c524166..37884f0 100644
--- a/tests/aux/pipeline.py
+++ b/tests/aux/pipeline.py
@@ -35,4 +35,16 @@
     bib_data = get_duplicates(bib_data, bib_files[0])
     # get names, remove CDS, find self cites
     get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)
-bib_check(homedir)
\ No newline at end of file
+bib_check(homedir)
+
+# queries
+try:
+    f = open("genderAPIkey.txt", "r")
+    genderAPI_key = f.readline().replace('\n', '')
+except:
+    genderAPI_key = input("Enter genderAPI key:")
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+
+mm, wm, mw, ww, WW, aw, wa, aa, citation_matrix = get_pred_demos((yourFirstAuthor+' '+yourLastAuthor).replace(',',''), homedir, bib_data, genderAPI_key)
+statement, statementLatex = print_statements(mm, wm, mw, ww, WW, aw, wa, aa)
+print(statement)
\ No newline at end of file
diff --git a/tests/erroneous/pipeline.py b/tests/erroneous/pipeline.py
index c524166..37884f0 100644
--- a/tests/erroneous/pipeline.py
+++ b/tests/erroneous/pipeline.py
@@ -35,4 +35,16 @@
     bib_data = get_duplicates(bib_data, bib_files[0])
     # get names, remove CDS, find self cites
     get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)
-bib_check(homedir)
\ No newline at end of file
+bib_check(homedir)
+
+# queries
+try:
+    f = open("genderAPIkey.txt", "r")
+    genderAPI_key = f.readline().replace('\n', '')
+except:
+    genderAPI_key = input("Enter genderAPI key:")
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+
+mm, wm, mw, ww, WW, aw, wa, aa, citation_matrix = get_pred_demos((yourFirstAuthor+' '+yourLastAuthor).replace(',',''), homedir, bib_data, genderAPI_key)
+statement, statementLatex = print_statements(mm, wm, mw, ww, WW, aw, wa, aa)
+print(statement)
\ No newline at end of file
diff --git a/tests/erroneous/testBib_erroneous_clean.bib b/tests/erroneous/testBib_erroneous_clean.bib
index dc0ece8..db73a74 100644
--- a/tests/erroneous/testBib_erroneous_clean.bib
+++ b/tests/erroneous/testBib_erroneous_clean.bib
@@ -234,4 +234,3 @@ @article{zurn2020network
  volume = {375},
  year = {2020}
 }
-
diff --git a/utils/__pycache__/queries.cpython-310.pyc b/utils/__pycache__/queries.cpython-310.pyc
index a4cf3c74c2e888a7aca0e059ba566383971bfefb..eb47eaa92184637f15d4825f0927a31059d9e014 100644
GIT binary patch
delta 186
zcmcZ`xHgbCpO=@50SK-byiF3@$Q#VcxN34D>k3BO&0K7p%)Iw&cx%LJMAMig8ET~_
z>u_fBN&!VAYb3xTa+B9_rt${Y@YRS)GQ=>|D%2`Y*5K0TEd+{4fJKyQl{e>bDR3}y
zP3{%y*XCp7VG?5GVdP+x0g`-7sC+IUUx0}NC<A0|_7}dw$arnDis&*S#>bOSs~0my
NO}5vV%`DHu3jiapD4qZS

delta 234
zcmZ1*csGzYpO=@50SKfG-X<;D$Q#VccxiGX>k7ue&0K7p%#u%Qcx%LJMAMig8EU0U
z7$NLtMn;Co3Y?jeYCu`Z8VRth98ea_2Fp(4OchP8;j0mqWQbv^Rj5@2%8G(zIJoph
zTY)kXU>T)aWuS}%NM^GOmjVZ)(BxF1eq$a+K1LoUAx0iX4n`Ru$;X5)!UYuJViaKF
m0Lp^do8^VCFf!iS{6%D$5aaX7v(<|kZ%yXcn9Z!r!wUe(DKV!2


From 7c9fe8b453121f9f0368eaa434cad98342d03ef0 Mon Sep 17 00:00:00 2001
From: Stiso <jeni.stiso@gmail.com>
Date: Fri, 5 Aug 2022 12:16:48 -0400
Subject: [PATCH 25/47] fixed bug in histogram

---
 cleanBib.ipynb | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cleanBib.ipynb b/cleanBib.ipynb
index 3df6c48..10b697f 100644
--- a/cleanBib.ipynb
+++ b/cleanBib.ipynb
@@ -318,7 +318,6 @@
     "gend_cats = names['GendCat'].dropna().unique()  # get a vector of all the gender categories in your paper\n",
     "\n",
     "# Create a data frame that will be used to plot the histogram. This will have the gender category (e.g., WW, MM) in the first column and the percentage (e.g., number of WW citations divided by total number of citations * 100) in the second column #\n",
-    "#dat_for_plot =\n",
     "dat_for_plot = names.groupby('GendCat').size().reset_index()\n",
     "dat_for_plot.rename(columns={0:'count'}, inplace=True)\n",
     "dat_for_plot = dat_for_plot.assign(percentage=dat_for_plot['count']/total_citations*100)\n",

From a9ab4b1b6fe7b5eb4e50c0af2b6772d2d4b568ae Mon Sep 17 00:00:00 2001
From: Stiso <jeni.stiso@gmail.com>
Date: Fri, 12 Aug 2022 16:46:02 -0400
Subject: [PATCH 26/47] added env file

---
 tests/immaculate/env_js.yml | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 tests/immaculate/env_js.yml

diff --git a/tests/immaculate/env_js.yml b/tests/immaculate/env_js.yml
new file mode 100644
index 0000000..6c49a40
--- /dev/null
+++ b/tests/immaculate/env_js.yml
@@ -0,0 +1,15 @@
+name: cleanBib
+channels:
+  - defaults
+dependencies:
+  - pip
+  - python
+  - habanero
+  - pylatexenc
+  - pybtex
+  - bibtexparser
+  - numpy
+  - tensorflow=2.8
+  - ipykernel
+  - seaborn
+prefix: /Users/stisoj/opt/anaconda3/envs/cleanBib

From 694882c9c7830463e29b9af71651cda14ddb00e4 Mon Sep 17 00:00:00 2001
From: Stiso <jeni.stiso@gmail.com>
Date: Fri, 12 Aug 2022 16:47:07 -0400
Subject: [PATCH 27/47] fixed file paths

---
 cleanBib.ipynb | 51 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 34 insertions(+), 17 deletions(-)

diff --git a/cleanBib.ipynb b/cleanBib.ipynb
index 10b697f..1133d54 100644
--- a/cleanBib.ipynb
+++ b/cleanBib.ipynb
@@ -43,11 +43,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {
     "kernel": "Python 3"
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "No optional .tex file found.\n"
+     ]
+    }
+   ],
    "source": [
     "import glob\n",
     "from habanero import Crossref\n",
@@ -55,7 +63,7 @@
     "import os\n",
     "from pathlib import Path\n",
     "wd = Path(os.getcwd())\n",
-    "sys.path.insert(1, f'{wd.parent.parent.absolute()}/utils')\n",
+    "sys.path.insert(1, f'{wd.absolute()}/utils')\n",
     "from preprocessing import *\n",
     "from ethnicolr import pred_fl_reg_name\n",
     "import tensorflow as tf\n",
@@ -146,6 +154,9 @@
   },
   {
    "cell_type": "markdown",
+   "metadata": {
+    "collapsed": false
+   },
    "source": [
     "## 3. Estimate gender and race of authors from cleaned bibliography\n",
     "\n",
@@ -171,10 +182,7 @@
     "__NOTE__: your free account has 500 queries per month. This box contains the code that will use your limited API credits/queries if it runs without error. Re-running all code repeatedly will repeatedly use these credits.\n",
     "\n",
     "Then, run the code blocks below. (click to select the block and then press Ctrl+Enter; or click the block and press the Run button in the top menubar)"
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
   },
   {
    "cell_type": "code",
@@ -400,17 +408,21 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "SoS",
-   "language": "sos",
-   "name": "sos"
+   "display_name": "Python 3.10.5 ('cleanBib')",
+   "language": "python",
+   "name": "python3"
   },
   "language_info": {
-   "codemirror_mode": "sos",
-   "file_extension": ".sos",
-   "mimetype": "text/x-sos",
-   "name": "sos",
-   "nbconvert_exporter": "sos_notebook.converter.SoS_Exporter",
-   "pygments_lexer": "sos"
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.5"
   },
   "sos": {
    "kernels": [
@@ -437,8 +449,13 @@
     "height": 0
    },
    "version": "0.20.1"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "66f30d3a05dff018f3baf45891c3cf21b32f9380ea78dc5d1d8b601d704d86ef"
+   }
   }
  },
  "nbformat": 4,
  "nbformat_minor": 1
-}
\ No newline at end of file
+}

From 3753a422470c0d038ceac1c327c14b9e8075353d Mon Sep 17 00:00:00 2001
From: Stiso <jeni.stiso@gmail.com>
Date: Fri, 12 Aug 2022 16:59:53 -0400
Subject: [PATCH 28/47] new environment

---
 env_js.yml | 173 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 173 insertions(+)
 create mode 100644 env_js.yml

diff --git a/env_js.yml b/env_js.yml
new file mode 100644
index 0000000..6b2fdd3
--- /dev/null
+++ b/env_js.yml
@@ -0,0 +1,173 @@
+name: cleanBib
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - abseil-cpp=20210324.2=he49afe7_0
+  - absl-py=1.1.0=pyhd8ed1ab_0
+  - aiohttp=3.8.1=py310h1961e1f_1
+  - aiosignal=1.2.0=pyhd8ed1ab_0
+  - appnope=0.1.2=py310hecd8cb5_1001
+  - asttokens=2.0.5=pyhd3eb1b0_0
+  - astunparse=1.6.3=pyhd8ed1ab_0
+  - async-timeout=4.0.2=pyhd8ed1ab_0
+  - attrs=21.4.0=pyhd8ed1ab_0
+  - backcall=0.2.0=pyhd3eb1b0_0
+  - bibtexparser=1.3.0=pyhd8ed1ab_0
+  - blas=1.0=mkl
+  - blinker=1.4=py_1
+  - bottleneck=1.3.5=py310h4e76f89_0
+  - brotli=1.0.9=hca72f7f_7
+  - brotli-bin=1.0.9=hca72f7f_7
+  - brotlipy=0.7.0=py310h1961e1f_1004
+  - bzip2=1.0.8=h0d85af4_4
+  - c-ares=1.18.1=h0d85af4_0
+  - ca-certificates=2022.07.19=hecd8cb5_0
+  - cachetools=5.0.0=pyhd8ed1ab_0
+  - certifi=2022.6.15=py310hecd8cb5_0
+  - cffi=1.15.1=py310h96bbf6e_0
+  - charset-normalizer=2.1.0=pyhd8ed1ab_0
+  - click=8.1.3=py310h2ec42d9_0
+  - colorama=0.4.5=pyhd8ed1ab_0
+  - cryptography=37.0.1=py310hf6deb26_0
+  - cycler=0.11.0=pyhd3eb1b0_0
+  - debugpy=1.5.1=py310he9d5cce_0
+  - decorator=5.1.1=pyhd3eb1b0_0
+  - entrypoints=0.4=py310hecd8cb5_0
+  - executing=0.8.3=pyhd3eb1b0_0
+  - fonttools=4.25.0=pyhd3eb1b0_0
+  - freetype=2.11.0=hd8bbffd_0
+  - frozenlist=1.3.0=py310h1961e1f_1
+  - gast=0.5.3=pyhd8ed1ab_0
+  - giflib=5.2.1=hbcb3906_2
+  - google-auth=2.9.1=pyh6c4a22f_0
+  - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0
+  - google-pasta=0.2.0=pyh8c360ce_0
+  - grpc-cpp=1.45.2=h360b188_4
+  - grpcio=1.45.0=py310h1da61bb_0
+  - h5py=3.6.0=py310h6c517f8_0
+  - habanero=1.2.2=pyh6c4a22f_0
+  - hdf5=1.10.6=hdbbcd12_0
+  - icu=70.1=h96cf925_0
+  - idna=3.3=pyhd8ed1ab_0
+  - importlib-metadata=4.11.4=py310h2ec42d9_0
+  - intel-openmp=2021.4.0=hecd8cb5_3538
+  - ipykernel=6.9.1=py310hecd8cb5_0
+  - ipython=8.4.0=py310hecd8cb5_0
+  - jedi=0.18.1=py310hecd8cb5_1
+  - jpeg=9e=hac89ed1_2
+  - jupyter_client=7.2.2=py310hecd8cb5_0
+  - jupyter_core=4.10.0=py310hecd8cb5_0
+  - keras=2.8.0=pyhd8ed1ab_0
+  - keras-preprocessing=1.1.2=pyhd8ed1ab_0
+  - kiwisolver=1.4.2=py310he9d5cce_0
+  - krb5=1.19.3=hb49756b_0
+  - latexcodec=2.0.1=pyh9f0ad1d_0
+  - lcms2=2.12=hf1fd2bf_0
+  - libbrotlicommon=1.0.9=hca72f7f_7
+  - libbrotlidec=1.0.9=hca72f7f_7
+  - libbrotlienc=1.0.9=hca72f7f_7
+  - libcurl=7.83.1=h372c54d_0
+  - libcxx=14.0.6=hce7ea42_0
+  - libedit=3.1.20191231=h0678c8f_2
+  - libev=4.33=haf1e3a3_1
+  - libffi=3.4.2=h0d85af4_5
+  - libgfortran=3.0.1=0
+  - libnghttp2=1.47.0=h942079c_0
+  - libpng=1.6.37=h5a3d3bf_3
+  - libprotobuf=3.20.1=h2292cb8_0
+  - libsodium=1.0.18=h1de35cc_0
+  - libssh2=1.10.0=h52ee1ee_2
+  - libtiff=4.2.0=hdb42f99_1
+  - libwebp=1.2.2=h56c3ce4_0
+  - libwebp-base=1.2.2=hca72f7f_0
+  - libzlib=1.2.12=hfe4f2af_2
+  - lz4-c=1.9.3=h23ab428_1
+  - markdown=3.4.1=pyhd8ed1ab_0
+  - matplotlib=3.5.1=py310hecd8cb5_1
+  - matplotlib-base=3.5.1=py310hfb0c5b7_1
+  - matplotlib-inline=0.1.2=pyhd3eb1b0_2
+  - mkl=2021.4.0=hecd8cb5_637
+  - mkl-service=2.4.0=py310hca72f7f_0
+  - mkl_fft=1.3.1=py310hf879493_0
+  - mkl_random=1.2.2=py310hc081a56_0
+  - multidict=6.0.2=py310h1961e1f_1
+  - munkres=1.1.4=py_0
+  - ncurses=6.3=h96cf925_1
+  - nest-asyncio=1.5.5=py310hecd8cb5_0
+  - numexpr=2.8.3=py310hdcd3fac_0
+  - numpy=1.22.3=py310hdcd3fac_0
+  - numpy-base=1.22.3=py310hfd2de13_0
+  - oauthlib=3.2.0=pyhd8ed1ab_0
+  - openssl=1.1.1q=hca72f7f_0
+  - opt_einsum=3.3.0=pyhd8ed1ab_1
+  - packaging=21.3=pyhd3eb1b0_0
+  - parso=0.8.3=pyhd3eb1b0_0
+  - pexpect=4.8.0=pyhd3eb1b0_3
+  - pickleshare=0.7.5=pyhd3eb1b0_1003
+  - pillow=9.2.0=py310hde71d04_1
+  - pip=22.1.2=py310hecd8cb5_0
+  - prompt-toolkit=3.0.20=pyhd3eb1b0_0
+  - protobuf=3.20.1=py310hd4537e4_0
+  - ptyprocess=0.7.0=pyhd3eb1b0_2
+  - pure_eval=0.2.2=pyhd3eb1b0_0
+  - pyasn1=0.4.8=py_0
+  - pyasn1-modules=0.2.7=py_0
+  - pybtex=0.24.0=pyhd8ed1ab_2
+  - pycparser=2.21=pyhd8ed1ab_0
+  - pygments=2.11.2=pyhd3eb1b0_0
+  - pyjwt=2.4.0=pyhd8ed1ab_0
+  - pylatexenc=2.10=pyhd8ed1ab_0
+  - pyopenssl=22.0.0=pyhd8ed1ab_0
+  - pyparsing=3.0.9=pyhd8ed1ab_0
+  - pysocks=1.7.1=py310h2ec42d9_5
+  - python=3.10.5=hdaaf3db_0_cpython
+  - python-dateutil=2.8.2=pyhd3eb1b0_0
+  - python-flatbuffers=2.0=pyhd8ed1ab_0
+  - python_abi=3.10=2_cp310
+  - pytz=2022.1=py310hecd8cb5_0
+  - pyu2f=0.1.5=pyhd8ed1ab_0
+  - pyyaml=6.0=py310h1961e1f_4
+  - pyzmq=23.2.0=py310he9d5cce_0
+  - re2=2022.06.01=hb486fe8_0
+  - readline=8.1.2=h3899abd_0
+  - requests=2.28.1=pyhd8ed1ab_0
+  - requests-oauthlib=1.3.1=pyhd8ed1ab_0
+  - rsa=4.9=pyhd8ed1ab_0
+  - scipy=1.7.3=py310h3dd3380_0
+  - seaborn=0.11.2=pyhd3eb1b0_0
+  - setuptools=63.2.0=py310h2ec42d9_0
+  - six=1.16.0=pyh6c4a22f_0
+  - snappy=1.1.9=h6e38e02_1
+  - sqlite=3.39.1=hd9f0692_0
+  - stack_data=0.2.0=pyhd3eb1b0_0
+  - tensorboard=2.8.0=pyhd8ed1ab_1
+  - tensorboard-data-server=0.6.0=py310hd6fa1ae_2
+  - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
+  - tensorflow=2.8.1=cpu_py310h22f808f_0
+  - tensorflow-base=2.8.1=cpu_py310h196d2ec_0
+  - tensorflow-estimator=2.8.1=cpu_py310h7bb394d_0
+  - termcolor=1.1.0=pyhd8ed1ab_3
+  - tk=8.6.12=h5dbffcc_0
+  - tornado=6.1=py310hca72f7f_0
+  - tqdm=4.64.0=pyhd8ed1ab_0
+  - traitlets=5.1.1=pyhd3eb1b0_0
+  - typing-extensions=4.3.0=hd8ed1ab_0
+  - typing_extensions=4.3.0=pyha770c72_0
+  - tzdata=2022a=h191b570_0
+  - urllib3=1.26.10=pyhd8ed1ab_0
+  - wcwidth=0.2.5=pyhd3eb1b0_0
+  - werkzeug=2.1.2=pyhd8ed1ab_1
+  - wheel=0.37.1=pyhd8ed1ab_0
+  - wrapt=1.14.1=py310h6c45266_0
+  - xz=5.2.5=haf1e3a3_1
+  - yaml=0.2.5=h0d85af4_2
+  - yarl=1.7.2=py310h1961e1f_2
+  - zeromq=4.3.4=h23ab428_0
+  - zipp=3.8.0=pyhd8ed1ab_0
+  - zlib=1.2.12=hfe4f2af_2
+  - zstd=1.5.2=hcb37349_0
+  - pip:
+    - ethnicolr==0.9.1
+    - pandas==1.4.3
+prefix: /Users/stisoj/opt/anaconda3/envs/cleanBib

From 0881e21fbe0b8af6589c7f4c59ffc59cf96e5590 Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Fri, 12 Aug 2022 17:03:44 -0400
Subject: [PATCH 29/47] move yml

---
 tests/immaculate/env_js.yml | 178 ++++++++++++++++++++++++++++++++++--
 1 file changed, 168 insertions(+), 10 deletions(-)

diff --git a/tests/immaculate/env_js.yml b/tests/immaculate/env_js.yml
index 6c49a40..6b2fdd3 100644
--- a/tests/immaculate/env_js.yml
+++ b/tests/immaculate/env_js.yml
@@ -1,15 +1,173 @@
 name: cleanBib
 channels:
+  - conda-forge
   - defaults
 dependencies:
-  - pip
-  - python
-  - habanero
-  - pylatexenc
-  - pybtex
-  - bibtexparser
-  - numpy
-  - tensorflow=2.8
-  - ipykernel
-  - seaborn
+  - abseil-cpp=20210324.2=he49afe7_0
+  - absl-py=1.1.0=pyhd8ed1ab_0
+  - aiohttp=3.8.1=py310h1961e1f_1
+  - aiosignal=1.2.0=pyhd8ed1ab_0
+  - appnope=0.1.2=py310hecd8cb5_1001
+  - asttokens=2.0.5=pyhd3eb1b0_0
+  - astunparse=1.6.3=pyhd8ed1ab_0
+  - async-timeout=4.0.2=pyhd8ed1ab_0
+  - attrs=21.4.0=pyhd8ed1ab_0
+  - backcall=0.2.0=pyhd3eb1b0_0
+  - bibtexparser=1.3.0=pyhd8ed1ab_0
+  - blas=1.0=mkl
+  - blinker=1.4=py_1
+  - bottleneck=1.3.5=py310h4e76f89_0
+  - brotli=1.0.9=hca72f7f_7
+  - brotli-bin=1.0.9=hca72f7f_7
+  - brotlipy=0.7.0=py310h1961e1f_1004
+  - bzip2=1.0.8=h0d85af4_4
+  - c-ares=1.18.1=h0d85af4_0
+  - ca-certificates=2022.07.19=hecd8cb5_0
+  - cachetools=5.0.0=pyhd8ed1ab_0
+  - certifi=2022.6.15=py310hecd8cb5_0
+  - cffi=1.15.1=py310h96bbf6e_0
+  - charset-normalizer=2.1.0=pyhd8ed1ab_0
+  - click=8.1.3=py310h2ec42d9_0
+  - colorama=0.4.5=pyhd8ed1ab_0
+  - cryptography=37.0.1=py310hf6deb26_0
+  - cycler=0.11.0=pyhd3eb1b0_0
+  - debugpy=1.5.1=py310he9d5cce_0
+  - decorator=5.1.1=pyhd3eb1b0_0
+  - entrypoints=0.4=py310hecd8cb5_0
+  - executing=0.8.3=pyhd3eb1b0_0
+  - fonttools=4.25.0=pyhd3eb1b0_0
+  - freetype=2.11.0=hd8bbffd_0
+  - frozenlist=1.3.0=py310h1961e1f_1
+  - gast=0.5.3=pyhd8ed1ab_0
+  - giflib=5.2.1=hbcb3906_2
+  - google-auth=2.9.1=pyh6c4a22f_0
+  - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0
+  - google-pasta=0.2.0=pyh8c360ce_0
+  - grpc-cpp=1.45.2=h360b188_4
+  - grpcio=1.45.0=py310h1da61bb_0
+  - h5py=3.6.0=py310h6c517f8_0
+  - habanero=1.2.2=pyh6c4a22f_0
+  - hdf5=1.10.6=hdbbcd12_0
+  - icu=70.1=h96cf925_0
+  - idna=3.3=pyhd8ed1ab_0
+  - importlib-metadata=4.11.4=py310h2ec42d9_0
+  - intel-openmp=2021.4.0=hecd8cb5_3538
+  - ipykernel=6.9.1=py310hecd8cb5_0
+  - ipython=8.4.0=py310hecd8cb5_0
+  - jedi=0.18.1=py310hecd8cb5_1
+  - jpeg=9e=hac89ed1_2
+  - jupyter_client=7.2.2=py310hecd8cb5_0
+  - jupyter_core=4.10.0=py310hecd8cb5_0
+  - keras=2.8.0=pyhd8ed1ab_0
+  - keras-preprocessing=1.1.2=pyhd8ed1ab_0
+  - kiwisolver=1.4.2=py310he9d5cce_0
+  - krb5=1.19.3=hb49756b_0
+  - latexcodec=2.0.1=pyh9f0ad1d_0
+  - lcms2=2.12=hf1fd2bf_0
+  - libbrotlicommon=1.0.9=hca72f7f_7
+  - libbrotlidec=1.0.9=hca72f7f_7
+  - libbrotlienc=1.0.9=hca72f7f_7
+  - libcurl=7.83.1=h372c54d_0
+  - libcxx=14.0.6=hce7ea42_0
+  - libedit=3.1.20191231=h0678c8f_2
+  - libev=4.33=haf1e3a3_1
+  - libffi=3.4.2=h0d85af4_5
+  - libgfortran=3.0.1=0
+  - libnghttp2=1.47.0=h942079c_0
+  - libpng=1.6.37=h5a3d3bf_3
+  - libprotobuf=3.20.1=h2292cb8_0
+  - libsodium=1.0.18=h1de35cc_0
+  - libssh2=1.10.0=h52ee1ee_2
+  - libtiff=4.2.0=hdb42f99_1
+  - libwebp=1.2.2=h56c3ce4_0
+  - libwebp-base=1.2.2=hca72f7f_0
+  - libzlib=1.2.12=hfe4f2af_2
+  - lz4-c=1.9.3=h23ab428_1
+  - markdown=3.4.1=pyhd8ed1ab_0
+  - matplotlib=3.5.1=py310hecd8cb5_1
+  - matplotlib-base=3.5.1=py310hfb0c5b7_1
+  - matplotlib-inline=0.1.2=pyhd3eb1b0_2
+  - mkl=2021.4.0=hecd8cb5_637
+  - mkl-service=2.4.0=py310hca72f7f_0
+  - mkl_fft=1.3.1=py310hf879493_0
+  - mkl_random=1.2.2=py310hc081a56_0
+  - multidict=6.0.2=py310h1961e1f_1
+  - munkres=1.1.4=py_0
+  - ncurses=6.3=h96cf925_1
+  - nest-asyncio=1.5.5=py310hecd8cb5_0
+  - numexpr=2.8.3=py310hdcd3fac_0
+  - numpy=1.22.3=py310hdcd3fac_0
+  - numpy-base=1.22.3=py310hfd2de13_0
+  - oauthlib=3.2.0=pyhd8ed1ab_0
+  - openssl=1.1.1q=hca72f7f_0
+  - opt_einsum=3.3.0=pyhd8ed1ab_1
+  - packaging=21.3=pyhd3eb1b0_0
+  - parso=0.8.3=pyhd3eb1b0_0
+  - pexpect=4.8.0=pyhd3eb1b0_3
+  - pickleshare=0.7.5=pyhd3eb1b0_1003
+  - pillow=9.2.0=py310hde71d04_1
+  - pip=22.1.2=py310hecd8cb5_0
+  - prompt-toolkit=3.0.20=pyhd3eb1b0_0
+  - protobuf=3.20.1=py310hd4537e4_0
+  - ptyprocess=0.7.0=pyhd3eb1b0_2
+  - pure_eval=0.2.2=pyhd3eb1b0_0
+  - pyasn1=0.4.8=py_0
+  - pyasn1-modules=0.2.7=py_0
+  - pybtex=0.24.0=pyhd8ed1ab_2
+  - pycparser=2.21=pyhd8ed1ab_0
+  - pygments=2.11.2=pyhd3eb1b0_0
+  - pyjwt=2.4.0=pyhd8ed1ab_0
+  - pylatexenc=2.10=pyhd8ed1ab_0
+  - pyopenssl=22.0.0=pyhd8ed1ab_0
+  - pyparsing=3.0.9=pyhd8ed1ab_0
+  - pysocks=1.7.1=py310h2ec42d9_5
+  - python=3.10.5=hdaaf3db_0_cpython
+  - python-dateutil=2.8.2=pyhd3eb1b0_0
+  - python-flatbuffers=2.0=pyhd8ed1ab_0
+  - python_abi=3.10=2_cp310
+  - pytz=2022.1=py310hecd8cb5_0
+  - pyu2f=0.1.5=pyhd8ed1ab_0
+  - pyyaml=6.0=py310h1961e1f_4
+  - pyzmq=23.2.0=py310he9d5cce_0
+  - re2=2022.06.01=hb486fe8_0
+  - readline=8.1.2=h3899abd_0
+  - requests=2.28.1=pyhd8ed1ab_0
+  - requests-oauthlib=1.3.1=pyhd8ed1ab_0
+  - rsa=4.9=pyhd8ed1ab_0
+  - scipy=1.7.3=py310h3dd3380_0
+  - seaborn=0.11.2=pyhd3eb1b0_0
+  - setuptools=63.2.0=py310h2ec42d9_0
+  - six=1.16.0=pyh6c4a22f_0
+  - snappy=1.1.9=h6e38e02_1
+  - sqlite=3.39.1=hd9f0692_0
+  - stack_data=0.2.0=pyhd3eb1b0_0
+  - tensorboard=2.8.0=pyhd8ed1ab_1
+  - tensorboard-data-server=0.6.0=py310hd6fa1ae_2
+  - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
+  - tensorflow=2.8.1=cpu_py310h22f808f_0
+  - tensorflow-base=2.8.1=cpu_py310h196d2ec_0
+  - tensorflow-estimator=2.8.1=cpu_py310h7bb394d_0
+  - termcolor=1.1.0=pyhd8ed1ab_3
+  - tk=8.6.12=h5dbffcc_0
+  - tornado=6.1=py310hca72f7f_0
+  - tqdm=4.64.0=pyhd8ed1ab_0
+  - traitlets=5.1.1=pyhd3eb1b0_0
+  - typing-extensions=4.3.0=hd8ed1ab_0
+  - typing_extensions=4.3.0=pyha770c72_0
+  - tzdata=2022a=h191b570_0
+  - urllib3=1.26.10=pyhd8ed1ab_0
+  - wcwidth=0.2.5=pyhd3eb1b0_0
+  - werkzeug=2.1.2=pyhd8ed1ab_1
+  - wheel=0.37.1=pyhd8ed1ab_0
+  - wrapt=1.14.1=py310h6c45266_0
+  - xz=5.2.5=haf1e3a3_1
+  - yaml=0.2.5=h0d85af4_2
+  - yarl=1.7.2=py310h1961e1f_2
+  - zeromq=4.3.4=h23ab428_0
+  - zipp=3.8.0=pyhd8ed1ab_0
+  - zlib=1.2.12=hfe4f2af_2
+  - zstd=1.5.2=hcb37349_0
+  - pip:
+    - ethnicolr==0.9.1
+    - pandas==1.4.3
 prefix: /Users/stisoj/opt/anaconda3/envs/cleanBib

From 450db3f50f3b3761e248d993ffdfabb7c9a86348 Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Fri, 12 Aug 2022 17:04:26 -0400
Subject: [PATCH 30/47] move yml

---
 env_js.yml | 173 -----------------------------------------------------
 1 file changed, 173 deletions(-)
 delete mode 100644 env_js.yml

diff --git a/env_js.yml b/env_js.yml
deleted file mode 100644
index 6b2fdd3..0000000
--- a/env_js.yml
+++ /dev/null
@@ -1,173 +0,0 @@
-name: cleanBib
-channels:
-  - conda-forge
-  - defaults
-dependencies:
-  - abseil-cpp=20210324.2=he49afe7_0
-  - absl-py=1.1.0=pyhd8ed1ab_0
-  - aiohttp=3.8.1=py310h1961e1f_1
-  - aiosignal=1.2.0=pyhd8ed1ab_0
-  - appnope=0.1.2=py310hecd8cb5_1001
-  - asttokens=2.0.5=pyhd3eb1b0_0
-  - astunparse=1.6.3=pyhd8ed1ab_0
-  - async-timeout=4.0.2=pyhd8ed1ab_0
-  - attrs=21.4.0=pyhd8ed1ab_0
-  - backcall=0.2.0=pyhd3eb1b0_0
-  - bibtexparser=1.3.0=pyhd8ed1ab_0
-  - blas=1.0=mkl
-  - blinker=1.4=py_1
-  - bottleneck=1.3.5=py310h4e76f89_0
-  - brotli=1.0.9=hca72f7f_7
-  - brotli-bin=1.0.9=hca72f7f_7
-  - brotlipy=0.7.0=py310h1961e1f_1004
-  - bzip2=1.0.8=h0d85af4_4
-  - c-ares=1.18.1=h0d85af4_0
-  - ca-certificates=2022.07.19=hecd8cb5_0
-  - cachetools=5.0.0=pyhd8ed1ab_0
-  - certifi=2022.6.15=py310hecd8cb5_0
-  - cffi=1.15.1=py310h96bbf6e_0
-  - charset-normalizer=2.1.0=pyhd8ed1ab_0
-  - click=8.1.3=py310h2ec42d9_0
-  - colorama=0.4.5=pyhd8ed1ab_0
-  - cryptography=37.0.1=py310hf6deb26_0
-  - cycler=0.11.0=pyhd3eb1b0_0
-  - debugpy=1.5.1=py310he9d5cce_0
-  - decorator=5.1.1=pyhd3eb1b0_0
-  - entrypoints=0.4=py310hecd8cb5_0
-  - executing=0.8.3=pyhd3eb1b0_0
-  - fonttools=4.25.0=pyhd3eb1b0_0
-  - freetype=2.11.0=hd8bbffd_0
-  - frozenlist=1.3.0=py310h1961e1f_1
-  - gast=0.5.3=pyhd8ed1ab_0
-  - giflib=5.2.1=hbcb3906_2
-  - google-auth=2.9.1=pyh6c4a22f_0
-  - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0
-  - google-pasta=0.2.0=pyh8c360ce_0
-  - grpc-cpp=1.45.2=h360b188_4
-  - grpcio=1.45.0=py310h1da61bb_0
-  - h5py=3.6.0=py310h6c517f8_0
-  - habanero=1.2.2=pyh6c4a22f_0
-  - hdf5=1.10.6=hdbbcd12_0
-  - icu=70.1=h96cf925_0
-  - idna=3.3=pyhd8ed1ab_0
-  - importlib-metadata=4.11.4=py310h2ec42d9_0
-  - intel-openmp=2021.4.0=hecd8cb5_3538
-  - ipykernel=6.9.1=py310hecd8cb5_0
-  - ipython=8.4.0=py310hecd8cb5_0
-  - jedi=0.18.1=py310hecd8cb5_1
-  - jpeg=9e=hac89ed1_2
-  - jupyter_client=7.2.2=py310hecd8cb5_0
-  - jupyter_core=4.10.0=py310hecd8cb5_0
-  - keras=2.8.0=pyhd8ed1ab_0
-  - keras-preprocessing=1.1.2=pyhd8ed1ab_0
-  - kiwisolver=1.4.2=py310he9d5cce_0
-  - krb5=1.19.3=hb49756b_0
-  - latexcodec=2.0.1=pyh9f0ad1d_0
-  - lcms2=2.12=hf1fd2bf_0
-  - libbrotlicommon=1.0.9=hca72f7f_7
-  - libbrotlidec=1.0.9=hca72f7f_7
-  - libbrotlienc=1.0.9=hca72f7f_7
-  - libcurl=7.83.1=h372c54d_0
-  - libcxx=14.0.6=hce7ea42_0
-  - libedit=3.1.20191231=h0678c8f_2
-  - libev=4.33=haf1e3a3_1
-  - libffi=3.4.2=h0d85af4_5
-  - libgfortran=3.0.1=0
-  - libnghttp2=1.47.0=h942079c_0
-  - libpng=1.6.37=h5a3d3bf_3
-  - libprotobuf=3.20.1=h2292cb8_0
-  - libsodium=1.0.18=h1de35cc_0
-  - libssh2=1.10.0=h52ee1ee_2
-  - libtiff=4.2.0=hdb42f99_1
-  - libwebp=1.2.2=h56c3ce4_0
-  - libwebp-base=1.2.2=hca72f7f_0
-  - libzlib=1.2.12=hfe4f2af_2
-  - lz4-c=1.9.3=h23ab428_1
-  - markdown=3.4.1=pyhd8ed1ab_0
-  - matplotlib=3.5.1=py310hecd8cb5_1
-  - matplotlib-base=3.5.1=py310hfb0c5b7_1
-  - matplotlib-inline=0.1.2=pyhd3eb1b0_2
-  - mkl=2021.4.0=hecd8cb5_637
-  - mkl-service=2.4.0=py310hca72f7f_0
-  - mkl_fft=1.3.1=py310hf879493_0
-  - mkl_random=1.2.2=py310hc081a56_0
-  - multidict=6.0.2=py310h1961e1f_1
-  - munkres=1.1.4=py_0
-  - ncurses=6.3=h96cf925_1
-  - nest-asyncio=1.5.5=py310hecd8cb5_0
-  - numexpr=2.8.3=py310hdcd3fac_0
-  - numpy=1.22.3=py310hdcd3fac_0
-  - numpy-base=1.22.3=py310hfd2de13_0
-  - oauthlib=3.2.0=pyhd8ed1ab_0
-  - openssl=1.1.1q=hca72f7f_0
-  - opt_einsum=3.3.0=pyhd8ed1ab_1
-  - packaging=21.3=pyhd3eb1b0_0
-  - parso=0.8.3=pyhd3eb1b0_0
-  - pexpect=4.8.0=pyhd3eb1b0_3
-  - pickleshare=0.7.5=pyhd3eb1b0_1003
-  - pillow=9.2.0=py310hde71d04_1
-  - pip=22.1.2=py310hecd8cb5_0
-  - prompt-toolkit=3.0.20=pyhd3eb1b0_0
-  - protobuf=3.20.1=py310hd4537e4_0
-  - ptyprocess=0.7.0=pyhd3eb1b0_2
-  - pure_eval=0.2.2=pyhd3eb1b0_0
-  - pyasn1=0.4.8=py_0
-  - pyasn1-modules=0.2.7=py_0
-  - pybtex=0.24.0=pyhd8ed1ab_2
-  - pycparser=2.21=pyhd8ed1ab_0
-  - pygments=2.11.2=pyhd3eb1b0_0
-  - pyjwt=2.4.0=pyhd8ed1ab_0
-  - pylatexenc=2.10=pyhd8ed1ab_0
-  - pyopenssl=22.0.0=pyhd8ed1ab_0
-  - pyparsing=3.0.9=pyhd8ed1ab_0
-  - pysocks=1.7.1=py310h2ec42d9_5
-  - python=3.10.5=hdaaf3db_0_cpython
-  - python-dateutil=2.8.2=pyhd3eb1b0_0
-  - python-flatbuffers=2.0=pyhd8ed1ab_0
-  - python_abi=3.10=2_cp310
-  - pytz=2022.1=py310hecd8cb5_0
-  - pyu2f=0.1.5=pyhd8ed1ab_0
-  - pyyaml=6.0=py310h1961e1f_4
-  - pyzmq=23.2.0=py310he9d5cce_0
-  - re2=2022.06.01=hb486fe8_0
-  - readline=8.1.2=h3899abd_0
-  - requests=2.28.1=pyhd8ed1ab_0
-  - requests-oauthlib=1.3.1=pyhd8ed1ab_0
-  - rsa=4.9=pyhd8ed1ab_0
-  - scipy=1.7.3=py310h3dd3380_0
-  - seaborn=0.11.2=pyhd3eb1b0_0
-  - setuptools=63.2.0=py310h2ec42d9_0
-  - six=1.16.0=pyh6c4a22f_0
-  - snappy=1.1.9=h6e38e02_1
-  - sqlite=3.39.1=hd9f0692_0
-  - stack_data=0.2.0=pyhd3eb1b0_0
-  - tensorboard=2.8.0=pyhd8ed1ab_1
-  - tensorboard-data-server=0.6.0=py310hd6fa1ae_2
-  - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
-  - tensorflow=2.8.1=cpu_py310h22f808f_0
-  - tensorflow-base=2.8.1=cpu_py310h196d2ec_0
-  - tensorflow-estimator=2.8.1=cpu_py310h7bb394d_0
-  - termcolor=1.1.0=pyhd8ed1ab_3
-  - tk=8.6.12=h5dbffcc_0
-  - tornado=6.1=py310hca72f7f_0
-  - tqdm=4.64.0=pyhd8ed1ab_0
-  - traitlets=5.1.1=pyhd3eb1b0_0
-  - typing-extensions=4.3.0=hd8ed1ab_0
-  - typing_extensions=4.3.0=pyha770c72_0
-  - tzdata=2022a=h191b570_0
-  - urllib3=1.26.10=pyhd8ed1ab_0
-  - wcwidth=0.2.5=pyhd3eb1b0_0
-  - werkzeug=2.1.2=pyhd8ed1ab_1
-  - wheel=0.37.1=pyhd8ed1ab_0
-  - wrapt=1.14.1=py310h6c45266_0
-  - xz=5.2.5=haf1e3a3_1
-  - yaml=0.2.5=h0d85af4_2
-  - yarl=1.7.2=py310h1961e1f_2
-  - zeromq=4.3.4=h23ab428_0
-  - zipp=3.8.0=pyhd8ed1ab_0
-  - zlib=1.2.12=hfe4f2af_2
-  - zstd=1.5.2=hcb37349_0
-  - pip:
-    - ethnicolr==0.9.1
-    - pandas==1.4.3
-prefix: /Users/stisoj/opt/anaconda3/envs/cleanBib

From 817337928bf8a281d794cfa9991ca249bb07dc99 Mon Sep 17 00:00:00 2001
From: Stiso <jeni.stiso@gmail.com>
Date: Fri, 12 Aug 2022 17:08:39 -0400
Subject: [PATCH 31/47] trying third env

---
 tests/env_js.yml            | 173 ++++++++++++++++++++++++++++++++++++
 tests/immaculate/env_js.yml |  15 ----
 2 files changed, 173 insertions(+), 15 deletions(-)
 create mode 100644 tests/env_js.yml
 delete mode 100644 tests/immaculate/env_js.yml

diff --git a/tests/env_js.yml b/tests/env_js.yml
new file mode 100644
index 0000000..26d4341
--- /dev/null
+++ b/tests/env_js.yml
@@ -0,0 +1,173 @@
+name: cleanBib
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - abseil-cpp=20210324.2
+  - absl-py=1.1.0
+  - aiohttp=3.8.1
+  - aiosignal=1.2.0
+  - appnope=0.1.2
+  - asttokens=2.0.5
+  - astunparse=1.6.3
+  - async-timeout=4.0.2
+  - attrs=21.4.0
+  - backcall=0.2.0
+  - bibtexparser=1.3.0
+  - blas=1.0
+  - blinker=1.4
+  - bottleneck=1.3.5
+  - brotli=1.0.9
+  - brotli-bin=1.0.9
+  - brotlipy=0.7.0
+  - bzip2=1.0.8
+  - c-ares=1.18.1
+  - ca-certificates=2022.07.19
+  - cachetools=5.0.0
+  - certifi=2022.6.15
+  - cffi=1.15.1
+  - charset-normalizer=2.1.0
+  - click=8.1.3
+  - colorama=0.4.5
+  - cryptography=37.0.1
+  - cycler=0.11.0
+  - debugpy=1.5.1
+  - decorator=5.1.1
+  - entrypoints=0.4
+  - executing=0.8.3
+  - fonttools=4.25.0
+  - freetype=2.11.0
+  - frozenlist=1.3.0
+  - gast=0.5.3
+  - giflib=5.2.1
+  - google-auth=2.9.1
+  - google-auth-oauthlib=0.4.6
+  - google-pasta=0.2.0
+  - grpc-cpp=1.45.2
+  - grpcio=1.45.0
+  - h5py=3.6.0
+  - habanero=1.2.2
+  - hdf5=1.10.6
+  - icu=70.1
+  - idna=3.3
+  - importlib-metadata=4.11.4
+  - intel-openmp=2021.4.0
+  - ipykernel=6.9.1
+  - ipython=8.4.0
+  - jedi=0.18.1
+  - jpeg=9e
+  - jupyter_client=7.2.2
+  - jupyter_core=4.10.0
+  - keras=2.8.0
+  - keras-preprocessing=1.1.2
+  - kiwisolver=1.4.2
+  - krb5=1.19.3
+  - latexcodec=2.0.1
+  - lcms2=2.12
+  - libbrotlicommon=1.0.9
+  - libbrotlidec=1.0.9
+  - libbrotlienc=1.0.9
+  - libcurl=7.83.1
+  - libcxx=14.0.6
+  - libedit=3.1.20191231
+  - libev=4.33
+  - libffi=3.4.2
+  - libgfortran=3.0.1
+  - libnghttp2=1.47.0
+  - libpng=1.6.37
+  - libprotobuf=3.20.1
+  - libsodium=1.0.18
+  - libssh2=1.10.0
+  - libtiff=4.2.0
+  - libwebp=1.2.2
+  - libwebp-base=1.2.2
+  - libzlib=1.2.12
+  - lz4-c=1.9.3
+  - markdown=3.4.1
+  - matplotlib=3.5.1
+  - matplotlib-base=3.5.1
+  - matplotlib-inline=0.1.2
+  - mkl=2021.4.0
+  - mkl-service=2.4.0
+  - mkl_fft=1.3.1
+  - mkl_random=1.2.2
+  - multidict=6.0.2
+  - munkres=1.1.4
+  - ncurses=6.3
+  - nest-asyncio=1.5.5
+  - numexpr=2.8.3
+  - numpy=1.22.3
+  - numpy-base=1.22.3
+  - oauthlib=3.2.0
+  - openssl=1.1.1q
+  - opt_einsum=3.3.0
+  - packaging=21.3
+  - parso=0.8.3
+  - pexpect=4.8.0
+  - pickleshare=0.7.5
+  - pillow=9.2.0
+  - pip=22.1.2
+  - prompt-toolkit=3.0.20
+  - protobuf=3.20.1
+  - ptyprocess=0.7.0
+  - pure_eval=0.2.2
+  - pyasn1=0.4.8
+  - pyasn1-modules=0.2.7
+  - pybtex=0.24.0
+  - pycparser=2.21
+  - pygments=2.11.2
+  - pyjwt=2.4.0
+  - pylatexenc=2.10
+  - pyopenssl=22.0.0
+  - pyparsing=3.0.9
+  - pysocks=1.7.1
+  - python=3.10.5
+  - python-dateutil=2.8.2
+  - python-flatbuffers=2.0
+  - python_abi=3.10
+  - pytz=2022.1
+  - pyu2f=0.1.5
+  - pyyaml=6.0
+  - pyzmq=23.2.0
+  - re2=2022.06.01
+  - readline=8.1.2
+  - requests=2.28.1
+  - requests-oauthlib=1.3.1
+  - rsa=4.9
+  - scipy=1.7.3
+  - seaborn=0.11.2
+  - setuptools=63.2.0
+  - six=1.16.0
+  - snappy=1.1.9
+  - sqlite=3.39.1
+  - stack_data=0.2.0
+  - tensorboard=2.8.0
+  - tensorboard-data-server=0.6.0
+  - tensorboard-plugin-wit=1.8.1
+  - tensorflow=2.8.1
+  - tensorflow-base=2.8.1
+  - tensorflow-estimator=2.8.1
+  - termcolor=1.1.0
+  - tk=8.6.12
+  - tornado=6.1
+  - tqdm=4.64.0
+  - traitlets=5.1.1
+  - typing-extensions=4.3.0
+  - typing_extensions=4.3.0
+  - tzdata=2022a
+  - urllib3=1.26.10
+  - wcwidth=0.2.5
+  - werkzeug=2.1.2
+  - wheel=0.37.1
+  - wrapt=1.14.1
+  - xz=5.2.5
+  - yaml=0.2.5
+  - yarl=1.7.2
+  - zeromq=4.3.4
+  - zipp=3.8.0
+  - zlib=1.2.12
+  - zstd=1.5.2
+  - pip:
+    - ethnicolr==0.9.1
+    - pandas==1.4.3
+prefix: /Users/stisoj/opt/anaconda3/envs/cleanBib
diff --git a/tests/immaculate/env_js.yml b/tests/immaculate/env_js.yml
deleted file mode 100644
index 6c49a40..0000000
--- a/tests/immaculate/env_js.yml
+++ /dev/null
@@ -1,15 +0,0 @@
-name: cleanBib
-channels:
-  - defaults
-dependencies:
-  - pip
-  - python
-  - habanero
-  - pylatexenc
-  - pybtex
-  - bibtexparser
-  - numpy
-  - tensorflow=2.8
-  - ipykernel
-  - seaborn
-prefix: /Users/stisoj/opt/anaconda3/envs/cleanBib

From cf988f57ca2b0066196d08bd0837279a73a1a31e Mon Sep 17 00:00:00 2001
From: Stiso <jeni.stiso@gmail.com>
Date: Fri, 12 Aug 2022 17:31:21 -0400
Subject: [PATCH 32/47] working on query reporting

---
 cleanBib.ipynb | 279 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 263 insertions(+), 16 deletions(-)

diff --git a/cleanBib.ipynb b/cleanBib.ipynb
index 1133d54..60d6ac3 100644
--- a/cleanBib.ipynb
+++ b/cleanBib.ipynb
@@ -43,7 +43,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {
     "kernel": "Python 3"
    },
@@ -70,7 +70,7 @@
     "import seaborn as sns\n",
     "\n",
     "cr = Crossref()\n",
-    "homedir = '/home/jovyan/'\n",
+    "homedir = '/Users/stisoj/Documents/dev/cleanBib/tests/immaculate/'\n",
     "bib_files = glob.glob(homedir + '*.bib')\n",
     "paper_aux_file = glob.glob(homedir + '*.aux')\n",
     "paper_bib_file = 'library_paper.bib'\n",
@@ -117,14 +117,35 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "metadata": {
     "kernel": "Python 3"
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1: buzsaki2013memory\n",
+      "2: Lundine2019\t\t  <--  ***NAME MISSING OR POSSIBLY INCOMPLETE***\n",
+      "3: zurn2020network\n",
+      "4: moralia2005\n",
+      "5: bassett2022curious\n",
+      "6: fake2022  <-- self-citation\n",
+      "7: jurafsky2018n\t\t  <--  ***NAME MISSING OR POSSIBLY INCOMPLETE***\n",
+      "8: mitchell2013gendered\n",
+      "9: chatterjee2021gender\n",
+      "10: fulvio2021imbalance\n",
+      "11: ethnicolr2022black\n",
+      "12: ethnicolr2022hispanic\n",
+      "13: ethnicolr2022asian\n",
+      "14: ethnicolr2022white\n"
+     ]
+    }
+   ],
    "source": [
-    "yourFirstAuthor = 'LastName, FirstName OptionalMiddleInitial'\n",
-    "yourLastAuthor = 'LastName, FirstName OptionalMiddleInitial'\n",
+    "yourFirstAuthor = 'Stiso, Jennifer'\n",
+    "yourLastAuthor = 'Zhou, Dale'\n",
     "optionalEqualContributors = ['LastName, FirstName OptionalMiddleInitial', 'LastName, FirstName OptionalMiddleInitial']\n",
     "checkingPublishedArticle = False\n",
     "\n",
@@ -143,11 +164,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {
     "kernel": "R"
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "STOP: Please remove self-citations. Then, re-run step 2. Here are some suggestions to check for with the following citation keys in your .bib file: \n",
+      "['fake2022']\n",
+      "STOP: Please revise incomplete full first names or empty cells. Then, re-run step 2. Here are some suggestions to check for with the following citation keys in your .bib file: \n",
+      "['Lundine2019', 'jurafsky2018n']\n",
+      "Only continue if you've run steps 2, and this code no longer returns errors.\n"
+     ]
+    }
+   ],
    "source": [
     "bib_check(homedir)"
    ]
@@ -186,27 +219,38 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 27,
    "metadata": {
     "kernel": "R"
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Remaining credits: 262\n",
+      "This should use (at most) 25 credits, saving you approx 1 credit(s) by storing queries.\n"
+     ]
+    }
+   ],
    "source": [
-    "genderAPI_key = '&key=YOUR ACCOUNT KEY HERE'\n",
+    "genderAPI_key = '&key='\n",
     "\n",
     "# The following saves the api key to a txt file just to be reloaded by the next cell\n",
     "with open(\"genderAPIkey.txt\", 'w') as f:\n",
     "    f.write(genderAPI_key)\n",
     "\n",
     "# Check your credit balance\n",
+    "authors_full_list = pd.read_csv(homedir + 'cleanedBib.csv')\n",
     "url = \"https://gender-api.com/get-stats?key=\" + genderAPI_key\n",
     "response = urlopen(url)\n",
     "decoded = response.read().decode('utf-8')\n",
     "decoded_json = json.loads(decoded)\n",
     "print('Remaining credits: %s'%decoded_json[\"remaining_requests\"])\n",
-    "print('This should use (at most) %d credits, '%len(np.unique(authors_full_list)) + \\\n",
-    "      'saving you approx %d'%(len(authors_full_list)-len(np.unique(authors_full_list))) + \\\n",
-    "      ' credits if results are stored.')"
+    "print('This should use (at most) %d credits, '%(authors_full_list.FA.nunique() + authors_full_list.LA.nunique()) + \\\n",
+    "        'saving you approx %d'%((authors_full_list.FA.count() + authors_full_list.LA.count())-\n",
+    "                            (authors_full_list.FA.nunique() + authors_full_list.LA.nunique())) + \\\n",
+    "      ' credit(s) by storing queries.')"
    ]
   },
   {
@@ -224,11 +268,214 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
    "metadata": {
     "kernel": "Python 3"
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "first author is Jennifer Stiso \n",
+      "last author is Dale Zhou \n",
+      "we don't count these, but check the predictions file to ensure your names did not slip through!\n",
+      "looping through your references, predicting gender and race\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  0%|          | 0/23 [00:00<?, ?it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      "  4%|▍         | 1/23 [00:01<00:24,  1.11s/it]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      "  9%|▊         | 2/23 [00:01<00:19,  1.06it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      " 13%|█▎        | 3/23 [00:02<00:17,  1.13it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      " 17%|█▋        | 4/23 [00:03<00:16,  1.17it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      " 22%|██▏       | 5/23 [00:03<00:12,  1.44it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      " 35%|███▍      | 8/23 [00:04<00:06,  2.30it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      " 39%|███▉      | 9/23 [00:05<00:06,  2.31it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      " 43%|████▎     | 10/23 [00:06<00:06,  1.91it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      " 48%|████▊     | 11/23 [00:06<00:07,  1.68it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      " 52%|█████▏    | 12/23 [00:07<00:07,  1.54it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      " 57%|█████▋    | 13/23 [00:08<00:05,  1.72it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      " 65%|██████▌   | 15/23 [00:08<00:04,  1.98it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      " 70%|██████▉   | 16/23 [00:09<00:04,  1.70it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      " 74%|███████▍  | 17/23 [00:10<00:03,  1.82it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      " 78%|███████▊  | 18/23 [00:10<00:03,  1.62it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      " 83%|████████▎ | 19/23 [00:11<00:02,  1.50it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      " 87%|████████▋ | 20/23 [00:12<00:02,  1.41it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      " 91%|█████████▏| 21/23 [00:13<00:01,  1.35it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      " 96%|█████████▌| 22/23 [00:13<00:00,  1.52it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  fa_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  la_data = np.array(\n",
+      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
+      "100%|██████████| 23/23 [00:14<00:00,  1.58it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Queried gender api 35 times out of 46 entries\n",
+      "Queried race/ethnicity api 36 times out of 46 entries\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "/Users/stisoj/opt/anaconda3/envs/cleanBib/lib/python3.10/site-packages/numpy/core/_methods.py:163: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  arr = asanyarray(a)\n"
+     ]
+    }
+   ],
    "source": [
     "f = open(\"genderAPIkey.txt\", \"r\")\n",
     "genderAPI_key = f.readline().replace('\\n', '')\n",

From 28f1350abc1d90108fa5b41c96fc8b79c3485667 Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Sun, 27 Nov 2022 23:54:11 -0500
Subject: [PATCH 33/47] relax dependencies

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 1b88742..ba8ebce 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -72,7 +72,7 @@ pexpect==4.8.0
 pickleshare==0.7.5
 Pillow==9.1.0
 prometheus-client==0.14.1
-prompt-toolkit==3.0.29
+prompt-toolkit
 protobuf==3.20.1
 psutil==5.9.0
 ptyprocess==0.7.0

From bd523551ccdde6a597748989be75efddb12fa226 Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Mon, 28 Nov 2022 00:11:12 -0500
Subject: [PATCH 34/47] relax dependencies

---
 requirements.txt | 200 +++++++++++++++++++++++------------------------
 1 file changed, 100 insertions(+), 100 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index ba8ebce..247836d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,121 +1,121 @@
-absl-py==0.15.0
-appnope==0.1.3
-argon2-cffi==21.3.0
-argon2-cffi-bindings==21.2.0
-asttokens==2.0.5
-astunparse==1.6.3
-attrs==21.4.0
-backcall==0.2.0
-beautifulsoup4==4.11.1
+absl-py
+appnope
+argon2-cffi
+argon2-cffi-bindings
+asttokens
+astunparse
+attrs
+backcall
+beautifulsoup4
 bibtexparser==1.2.0
-bleach==5.0.0
-cachetools==4.2.4
-certifi==2021.10.8
-cffi==1.15.0
-charset-normalizer==2.0.12
-cycler==0.11.0
-debugpy==1.6.0
-decorator==4.4.2
-defusedxml==0.7.1
-entrypoints==0.4
+bleach
+cachetools
+certifi
+cffi
+charset-normalizer
+cycler
+debugpy
+decorator
+defusedxml
+entrypoints
 ethnicolr==0.8.1
-executing==0.8.3
-fastjsonschema==2.15.3
-flatbuffers==1.12
-fonttools==4.33.3
-folium==0.2.1
-future==0.18.2
-gast==0.4.0
-google-auth==1.35.0
-google-auth-oauthlib==0.4.6
-google-pasta==0.2.0
-grpcio==1.34.1
+executing
+fastjsonschema
+flatbuffers
+fonttools
+folium
+future
+gast
+google-auth
+google-auth-oauthlib
+google-pasta
+grpcio
 h5py==3.1.0
 habanero==1.2.0
-idna==3.3
-imgaug==0.2.6
-importlib-metadata==4.11.3
-ipykernel==4.10
-ipython==5.5.0
-jedi==0.18.1
-Jinja2==2.11.3
-jsonschema==4.4.0
-jupyter==1.0.0
-jupyter-client==7.3.0
-jupyter-console==6.4.3
-jupyter-core==4.10.0
-jupyterlab-pygments==0.2.2
-jupyterlab-widgets==1.1.0
+idna
+imgaug
+importlib-metadata
+ipykernel
+ipython
+jedi
+Jinja2
+jsonschema
+jupyter
+jupyter-client
+jupyter-console
+jupyter-core
+jupyterlab-pygments
+jupyterlab-widgets
 keras==2.8.0
 keras-nightly==2.5.0.dev2021032900
 Keras-Preprocessing==1.1.2
-kiwisolver==1.4.2
+kiwisolver
 latexcodec==2.0.1
-Markdown==3.3.6
-MarkupSafe==2.1.1
-matplotlib==3.5.1
-matplotlib-inline==0.1.3
-mistune==0.8.4
-nbclient==0.6.0
-nbconvert==6.5.0
-nbformat==5.3.0
-nest-asyncio==1.5.5
-notebook==5.3.0
+Markdown
+MarkupSafe
+matplotlib
+matplotlib-inline
+mistune
+nbclient
+nbconvert
+nbformat
+nest-asyncio
+notebook
 numpy==1.21
-oauthlib==3.2.0
-opt-einsum==3.3.0
-packaging==21.3
+oauthlib
+opt-einsum
+packaging
 pandas==1.3.5
-pandocfilters==1.5.0
-parso==0.8.3
-pexpect==4.8.0
-pickleshare==0.7.5
-Pillow==9.1.0
-prometheus-client==0.14.1
+pandocfilters
+parso
+pexpect
+pickleshare
+Pillow
+prometheus-client
 prompt-toolkit
 protobuf==3.20.1
-psutil==5.9.0
-ptyprocess==0.7.0
-pure-eval==0.2.2
-pyasn1==0.4.8
-pyasn1-modules==0.2.8
+psutil
+ptyprocess
+pure-eval
+pyasn1
+pyasn1-modules
 pybtex==0.24.0
-pycparser==2.21
-Pygments==2.12.0
+pycparser
+Pygments
 pylatexenc==2.10
-pyparsing==3.0.8
-pyrsistent==0.18.1
-python-dateutil==2.8.2
-pytz==2022.1
-PyYAML==6.0
-pyzmq==22.3.0
-qtconsole==5.3.0
-QtPy==2.1.0
-requests==2.23.0
-requests-oauthlib==1.3.1
-rsa==4.8
+pyparsing
+pyrsistent
+python-dateutil
+pytz
+PyYAML
+pyzmq
+qtconsole
+QtPy
+requests
+requests-oauthlib
+rsa
 scipy==1.7.3
 seaborn==0.11.2
-Send2Trash==1.8.0
-six==1.15.0
-soupsieve==2.3.2.post1
-stack-data==0.2.0
+Send2Trash
+six
+soupsieve
+stack-data
 tensorboard==2.9.0
 tensorboard-data-server==0.6.1
 tensorboard-plugin-wit==1.8.1
 tensorflow==2.5.2
 tensorflow-estimator==2.5.0
-termcolor==1.1.0
-terminado==0.13.3
-tinycss2==1.1.1
-tornado==5.1.0
-tqdm==4.64.0
-traitlets==5.1.1
-typing-extensions==3.7.4.3
-urllib3==1.26.9
-wcwidth==0.2.5
-webencodings==0.5.1
-Werkzeug==1.0.1
-widgetsnbextension==3.6.0
-wrapt==1.12.1
-zipp==3.8.0
+termcolor
+terminado
+tinycss2
+tornado
+tqdm
+traitlets
+typing-extensions
+urllib3
+wcwidth
+webencodings
+Werkzeug
+widgetsnbextension
+wrapt
+zipp

From 28fc386f6fb9382d934845cc0f0539959700e736 Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Mon, 28 Nov 2022 00:18:39 -0500
Subject: [PATCH 35/47] relax dependencies

---
 requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 247836d..17b5c71 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,7 +30,7 @@ google-auth
 google-auth-oauthlib
 google-pasta
 grpcio
-h5py==3.1.0
+h5py
 habanero==1.2.0
 idna
 imgaug
@@ -61,7 +61,7 @@ nbconvert
 nbformat
 nest-asyncio
 notebook
-numpy==1.21
+numpy==1.19.2
 oauthlib
 opt-einsum
 packaging

From 8561e75a791e349a94373d90b8c38572b0d0c5ae Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Mon, 28 Nov 2022 00:49:34 -0500
Subject: [PATCH 36/47] upgrade ethnicolr

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 17b5c71..6b74e1b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,7 +18,7 @@ debugpy
 decorator
 defusedxml
 entrypoints
-ethnicolr==0.8.1
+ethnicolr==0.9.1
 executing
 fastjsonschema
 flatbuffers

From dc1950fb9aa0efccd9a18bd5c94bede8050989d3 Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Mon, 28 Nov 2022 00:56:53 -0500
Subject: [PATCH 37/47] update versions

---
 requirements.txt | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 6b74e1b..d4842cd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,7 +7,7 @@ astunparse
 attrs
 backcall
 beautifulsoup4
-bibtexparser==1.2.0
+bibtexparser==1.3.0
 bleach
 cachetools
 certifi
@@ -31,7 +31,7 @@ google-auth-oauthlib
 google-pasta
 grpcio
 h5py
-habanero==1.2.0
+habanero==1.2.2
 idna
 imgaug
 importlib-metadata
@@ -47,8 +47,7 @@ jupyter-core
 jupyterlab-pygments
 jupyterlab-widgets
 keras==2.8.0
-keras-nightly==2.5.0.dev2021032900
-Keras-Preprocessing==1.1.2
+keras-preprocessing==1.1.2
 kiwisolver
 latexcodec==2.0.1
 Markdown
@@ -61,11 +60,11 @@ nbconvert
 nbformat
 nest-asyncio
 notebook
-numpy==1.19.2
+numpy==1.22.3
 oauthlib
 opt-einsum
 packaging
-pandas==1.3.5
+pandas==1.4.3
 pandocfilters
 parso
 pexpect
@@ -100,11 +99,12 @@ Send2Trash
 six
 soupsieve
 stack-data
-tensorboard==2.9.0
-tensorboard-data-server==0.6.1
+tensorboard==2.8.0
+tensorboard-data-server==0.6.0
 tensorboard-plugin-wit==1.8.1
-tensorflow==2.5.2
-tensorflow-estimator==2.5.0
+tensorflow==2.8.1
+tensorflow-base==2.8.1
+tensorflow-estimator==2.8.1
 termcolor
 terminado
 tinycss2

From cfc9643080eafa406d65eec678f7933cb69dae7d Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Mon, 28 Nov 2022 00:58:07 -0500
Subject: [PATCH 38/47] update python

---
 environment.yml | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 environment.yml

diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..9794d26
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,32 @@
+name: cleanBib
+channels:
+  - conda-forge
+dependencies:
+  - python=3.10.5
+  - pip
+  - pip:
+    - pybtex==0.24.0
+    - numpy==1.19.5
+    - bibtexparser==1.2.0
+    - pandas==1.4.2
+    - pylatexenc==2.10
+    - sos
+    - sos-notebook
+    - habanero==1.2.0
+    - ethnicolr==0.8.1
+    - matplotlib==3.5.1
+    - seaborn==0.11.2
+    - scipy==1.8.0
+    - h5py==3.1.0
+    - oauthlib==3.2.0
+    - rsa==4.8
+    - Keras==2.8.0
+    - tensorflow==2.5.2
+    - protobuf==3.20.1
+    - nbgitpuller
+    - sphinx-gallery
+    - re
+    - tqdm
+    - json
+    - pickle
+    - urllib
\ No newline at end of file

From 7d6205c5960507465bd0d2bd2d0715d0736a7979 Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Mon, 28 Nov 2022 01:12:29 -0500
Subject: [PATCH 39/47] rm environment.yml

---
 environment.yml | 32 --------------------------------
 1 file changed, 32 deletions(-)
 delete mode 100644 environment.yml

diff --git a/environment.yml b/environment.yml
deleted file mode 100644
index 9794d26..0000000
--- a/environment.yml
+++ /dev/null
@@ -1,32 +0,0 @@
-name: cleanBib
-channels:
-  - conda-forge
-dependencies:
-  - python=3.10.5
-  - pip
-  - pip:
-    - pybtex==0.24.0
-    - numpy==1.19.5
-    - bibtexparser==1.2.0
-    - pandas==1.4.2
-    - pylatexenc==2.10
-    - sos
-    - sos-notebook
-    - habanero==1.2.0
-    - ethnicolr==0.8.1
-    - matplotlib==3.5.1
-    - seaborn==0.11.2
-    - scipy==1.8.0
-    - h5py==3.1.0
-    - oauthlib==3.2.0
-    - rsa==4.8
-    - Keras==2.8.0
-    - tensorflow==2.5.2
-    - protobuf==3.20.1
-    - nbgitpuller
-    - sphinx-gallery
-    - re
-    - tqdm
-    - json
-    - pickle
-    - urllib
\ No newline at end of file

From bd898d0a5cafb61b1632fb202cf7510cea7d1920 Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Mon, 28 Nov 2022 01:16:14 -0500
Subject: [PATCH 40/47] rm r

---
 runtime.txt | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 runtime.txt

diff --git a/runtime.txt b/runtime.txt
deleted file mode 100644
index 6a40cb8..0000000
--- a/runtime.txt
+++ /dev/null
@@ -1 +0,0 @@
-r-3.6-2019-04-12

From e727ae54f7ce201840bb42d78d1a95117beca491 Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Mon, 28 Nov 2022 01:19:52 -0500
Subject: [PATCH 41/47] update python

---
 runtime.txt | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 runtime.txt

diff --git a/runtime.txt b/runtime.txt
new file mode 100644
index 0000000..4fb51b0
--- /dev/null
+++ b/runtime.txt
@@ -0,0 +1 @@
+python-3.10.5
\ No newline at end of file

From 340b50b4068a4b17534a5183bf5dc0b3dcc81aa5 Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Mon, 28 Nov 2022 01:24:15 -0500
Subject: [PATCH 42/47] rm tf-base

---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index d4842cd..6e56071 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -103,7 +103,6 @@ tensorboard==2.8.0
 tensorboard-data-server==0.6.0
 tensorboard-plugin-wit==1.8.1
 tensorflow==2.8.1
-tensorflow-base==2.8.1
 tensorflow-estimator==2.8.1
 termcolor
 terminado

From be2bfa711e0724af09f90d1d486d81b1737a87ab Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Mon, 28 Nov 2022 01:29:21 -0500
Subject: [PATCH 43/47] relax dependencies

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 6e56071..3239c9a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -103,7 +103,7 @@ tensorboard==2.8.0
 tensorboard-data-server==0.6.0
 tensorboard-plugin-wit==1.8.1
 tensorflow==2.8.1
-tensorflow-estimator==2.8.1
+tensorflow-estimator
 termcolor
 terminado
 tinycss2

From 8944c1fe204505ff6509d72d1e7a214e2119b74e Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Mon, 28 Nov 2022 02:12:17 -0500
Subject: [PATCH 44/47] clean up

---
 diversityStatement/.DS_Store                  | Bin 6148 -> 0 bytes
 environment.yaml                              |  26 ------------------
 .../__pycache__/preprocessing.cpython-310.pyc | Bin 14445 -> 0 bytes
 .../__pycache__/preprocessing.cpython-39.pyc  | Bin 14879 -> 0 bytes
 utils/__pycache__/queries.cpython-310.pyc     | Bin 10413 -> 0 bytes
 utils/__pycache__/queries.cpython-39.pyc      | Bin 1893 -> 0 bytes
 6 files changed, 26 deletions(-)
 delete mode 100644 diversityStatement/.DS_Store
 delete mode 100644 environment.yaml
 delete mode 100644 utils/__pycache__/preprocessing.cpython-310.pyc
 delete mode 100644 utils/__pycache__/preprocessing.cpython-39.pyc
 delete mode 100644 utils/__pycache__/queries.cpython-310.pyc
 delete mode 100644 utils/__pycache__/queries.cpython-39.pyc

diff --git a/diversityStatement/.DS_Store b/diversityStatement/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3
zem<@ulZcFPQ@L2!n>{z**<q8>++&mCkOWA81W14cNZ<zv;LbK1Poaz?KmsK2CSc!(
z0ynLxE!0092;Krf2c+FF_Fe*7ECH>lEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ
zLs35+`xjp>T0<F0fCPF1$Cyrb|F7^5{eNG?83~ZUUlGt@xh*qZDeu<Z%US-OSsOPv
j)R!Z4KLME7ReXlK;d!wEw5GODWMKRea10D2@KpjYNUI8I

diff --git a/environment.yaml b/environment.yaml
deleted file mode 100644
index e1d9167..0000000
--- a/environment.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-name: cleanBib
-channels:
-  - conda-forge
-dependencies:
-  - python
-  - pip:
-    - nbgitpuller
-    - sphinx-gallery
-    - pybtex
-    - glob
-    - csv
-    - numpy
-    - bibtexparser
-    - subprocess
-    - os
-    - pandas
-    - re
-    - pylatexenc
-    - habanero
-    - tqdm
-    - json
-    - pickle
-    - urllib
-    - ethnicolr
-    - matplotlib
-    - seaborn
\ No newline at end of file
diff --git a/utils/__pycache__/preprocessing.cpython-310.pyc b/utils/__pycache__/preprocessing.cpython-310.pyc
deleted file mode 100644
index 5d0d8187dded3ab9a5f8648d7a7b8c294c4c3c80..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 14445
zcmb_jTW}lKdEOfq3lIcAiWDhXR@RqTf+|s#;x-QBsF8KCl}HqA$rmAZP!MNHf&{SO
z*@Y+~i`FrnO77IPo0%r<G)<`_ZKF2Lv@`9r)6TRneQ92LnaRsOrZb(<Tc_?!T1_nV
z`~O|sD9VjH0kvn(p38sk|M~ul+|kjrg5Nj)^j{YKYfe%AjXnl{IzB#&$NLcip$N66
z6zQ$i)VfyG)F@3a>YQg34ZQVQqHY$=I4@aDa-LPR@HT3xdb*g7^G1pzoR=wP@J`f5
zFGuvU#hfrj@-3y97nVrjJtop(1n+T?5u<qT5?PVMdqU*J7~Yd&T<pSox0n!<cu$GP
z#FUtROD#@|Jz_6X_K1hXKD_sehsA!p9}*uEHs1ThBjQoK9~SB*rSSM|j8mB_sA2AW
z*>|qZxx(>|Uv{qfNHQ0y3rIsO^K5kiiSuRYITGLMp^8cs<6i=A6VE9;UK@e0tf)d2
zT7cQvqb%w8)`ann)=>j>Mc*<+;-*URt%NWU(|mJN>F9xWT6uk{WBAFy*i?nSYRIPo
zgD8mPs)o3}W#Jn$^c8K7a#K~6wTDGI(61`;(*Z^k=oryRyY81)9lP4F%dKnnVzuVj
zi>|aW5L>vFR^4g%hi&QjE!nVZRnNEGMY~e<%YM~u*sIQlm$vcqm&?9gmP}^`^AHhQ
zX!&--^@(_wR?!6ITC3d7R+gR0DkwSL48A2z5KW@z3%BLpNwoaMBTt8F`i^?!wuZd9
z_VY)6M!!NTOna?`rgSTg=Y_g#H9}+l-253ys}h>RsbF2hgx9Q9{m`IMhkDIvgl4(f
zbQ+?NC|a1T5=~ZUO2=!}{9?Y<XnBq(m8C3ilw^6mI6joJSZ$PRp?d9>A}7(xolhTq
z37f+^>iJdAT{(I>o@?)@aIPM$)SPkyo9JlEuhzVyP3gp)tu~fsnj7I*fAUJ0t!B&5
zf~Gf(V6Tx;)2gXiY6gGfs`iGZ8mghmDP&e?Cc8g6yapcLh60fQn-Cjk+E;uvP_U&m
zq0`pcRJX7lu?aQ2O?r2<K*I)7d80k)>pxw2-*{6kSnYC}x6Ev_EX#GfQLa0)w&zO>
z4O`G(#^$3mE<;<3ev7s8(yWawVI#LjCoduq?aNu4=d0AK7ACmN?IZ?u`-6M;?zOWA
z4%_q6^_`0Eh=T`0rJXoXIBXvXbr3Bi!s&|JxavrsPxYmyQ>m6~r<QSkLmgcUlURUS
zx#EO-;&OvUF1s&XKKpc-ez{z0IcKDFWkC-O>P%=*$HR2Ld%X5hup6C|k0Y@0cuyd(
zk`q`e?VhG_c-P2UKi-yN8F#tP5I><^+v6wSz(=5LDK{177%JYHuLbI+3XY-k0zWRa
zEgg%e3nStT@i%x!^sQ+fqmyVnit-6;h)<%d39b^=N^UEaLn#BLJ{zd7MfFUi5C=LN
zD3=u901wiplpryMjRcNkwW~uLpjK@>cmvFKSJ+>6Y!6c!@07zf@s{<fzYNyn*!8Lq
zU|clk;8k_^sv~%V+@}3~N44i9-D9_D$1|h3&D<s-b4UpFrrVTb6f;)bYD4Zvv|!S?
zZ#bcj^NeU2EI@69iN&f!%#7OGwwPf)YWf0qW)6LM8kG1@TBfDa-$6Y4@ucw>YF?Gc
zP-18;hwg@NBDcGiIAXMhIC#8<k6|^iszYl?--BzY<8ILDZopC1cqKp8TS+QAxRS$V
zX+6*IdJd=4s(zkVHKvQzJpcb#O}A0o*uHSQVDjS_vZTWhEtxz(-?UotAcBIyOUA1u
z4^b*FRzm(9#Sc?3L%~rBo}_?yZU4N5W6^$!mdK|yI*$V0J_OMkJ)v4j>&`yaN?1R^
zis{x}18XIZqrlKQQI}{tG4sqI;wziV7TD%Z+T&{b(D1Af%PQgVXNMRUe-t!HY&jfd
zrd9G?E?7{D={Nz^wv9tRj0z<@9@Wg4{5{k^O7Agun$Ld!I;xM8EU=%z&~Y|}j(0*B
z-19{H@u8kSpwC5p?(FF>QT8g;YG_uM8m@FCvD<>lgOQ&=JRE5?s&ti#vR@8UC{r)j
zs%_kvPJ@yqU0MY#G^ml#pi<k%X`{q49*@S7G54yHQiiTE3kf~$_wV=+I3+?`B7O&+
zwW`a5gs-BrvV?OW%s{DP_rGIwh&Nf_Y$-e{@YBKgJ5Aqsj6H<2bl|MBokWmeYE8UH
z_9(atfi*F6`b{p|F_E6#Aw3BWm=rm1K$AEi(en*Iv8i5Fyc)+%`WpC74#Z6#U-T`T
zbcR0(ZoYPiF5g1F{r5vFCEV()efr`#0%*<F0#M7cBY?^LDz2Ys*F`t3U-fI?xWqR(
zN)>wSb*Oxo`Fc)m@f48SDS~WK%|@%f;7F=UnLR}4F_65!efwawA)IUI(830FGCJ&h
zt{=jZg*wszulV}slOF0APr(QiYb{4^02Vqm+}*g?t94*96VS_da4**Z;L1x*d(?OR
za_vZj#k??q&epwRVySu+uuYv|j)Gd{dbPIE)(+bRE7U47Osu<d)sr7X7b3<&{5Z^Q
zxGi~>IQ_|}LpG&mDfK)qf+(?I$<I)7g6FU}Iy6VcWHb~{K1XHLYQ&L_;A7hk4=3r;
z^3F=P{;G5qFU2#xhLRq!G9w8nr|rdEF`-%jb<_CE<Ihyhdq4@QaZkIK)wH{ouDxNX
zlDOG@e1Ps0=2OH%yWnn9Rb!Y3et;be=>n(brn*T|8%YL6``FO_YL=UhEOl}3tlccr
z<?E8zKtPyRv_1Y3qu~C2n6=y5XHLuk5ha>{cs|3>Fi~+^4If88J505hi!pPxQR1BK
zvo!`7ElP1A79QTgm^?cFnS_NolJkg$$pvh6!0RDKLF~H666tCNOCY>EhT($=lEAeE
z)=0>gge*E_K7_9>kRsL|r4|zHuMbVlvRijVRnFQJu@S>ob>#+63BgzVqW;X<<u26q
zof-hflD)oMtt`jcG@TG4?1REtEJLWUs~*cwadJGULUfqBhZE9PiCAZJq+Y<dE}t!E
z#cai`*WE@bLe^e!I%3kLhTHJ`azm7*DES*r$15Zx%}O!NU5tbp!-bq9X~IHtK9XYf
z(4u9g-5Q$FXhN&o_V#7UEIEFuoB1`6@#u=Qtlg?%S|6lI9C-x^L(4;aoE~b`I}X;c
z1YQ>?E4Wo}D&WB9-&SaGR&<CXhEQ)PQ&_esS`@;yB>kQYaF5+!N!K^GlGl|*!b6F-
zEUI;ymz8iVbA2FG!nza(?GbQyl!3gAFi6DG^3U(w9=CPLPYG*NeN$aa`sqM}uw<eK
zU-d^sN~E_k{wV5dn>bJC<p}Y8mBl5Ip;pD{l+rQ%Y+#D)ruJ^Olk{_2`9KMhH*xx2
zf&6Y@r*7)1(y)#x-mk8sHvL_}<By?MZd2<38DfKuVH5Yha3`$uHeivyJO77mEVqpn
zu|*3KZ3S2)653-4?8A0%4}W)wG3JKIxt&%!dEngX1GDylJL&;u;P-JM+GM+EZAnU9
zLhoO<NJ0_H9_5Czs{N+6Mz@+;)Xtse?F8<i%{(b5Q2Le{>QFUAyNu+fF#QZk<=EoQ
z6MsoG3GPfQ?MG?f^+!4LOwFy7Yu*V&y2Tkx&b1G|)F>}NWWdq#S{2d*EVgO@r4fW5
z7+w42a}AqVzunVo9Jbd<Nz%fs(#*yJN575@or$bEUZhCDHM{86YVJCn_BaP9kgufn
zQrbR8w=WJl{R1qL#5@k$O=tkX3BXo-YQ5h$^_Z?T8`uhWqg@rCSzW9`m%#0nYqf3`
z?h{7>YB}EQ?VpeooG!5>N&^YibxAjGVLZ&lohpecZdikKIt;7BWSk-AP*9E|2(?BN
zm*z6alH!8|E%^v#j!|%d0z&1XUN1N0ixkVCEx@~&VHl!SZ}QQ{_4zUtzf6VF{UzhJ
zLcJ=k$<HIJm~1%f#7T-H5h3PJ(MBjzeclb(;bs3LWOz>@Fp^nZmn3fO*E6_0C)9nq
z1pzyw8)}vXZS!tEkwJXly|iKIX<Vst1r;mAgt|X!VgwH}rtcvHW76?}33U>9OoG;(
zB!NvFYs*(xNKm$Js&cOnRz)!!+eBc|9U~I&=<n|7q<nKF8Kg*!iI^oUfYEf2-byV2
zjD8FE53_;v5F7Z+5F4O2N4Vt-hoevxroqbAJJ}%10ChxUz#MZy4ymi1d@#C|3Gzfa
z$Zw4Tx~c?MS#<Y<OfN5q7?g})J?96@ZDqwML<DUsCvu=VCXB6oq+AWgBG!sDGZ+I<
z{dzD4`k+DdJ7d8JD2_#QKx)`{4kicYU;?U69(^#jwDZ{ic5v)qfGU8nxprl3PE2;j
zgAt7I&)!zrDX>HB2F4_IZyFu_6{Rt1DE|1yk0SQz?-GF0#75VCus&FN3Z;!+>HpcW
zbkd(7%3!>^P%=w&-%~eoJC)_N?vC^2PAb?XCRerhR9Wn;I**tyC@7%cvi!+lbW^2y
z>`c%u2onD802duXi0{EDm)tvCQcUgv>%>@%C0;Lo%AXD(NQj4~z)<5W2m2E<m1tx*
zcf6k~_C+Od+4Of=>X`DL>a7jU^*B=)U;BZ-C&-}w<f?&@{Hs5;iOVR+VCFyFnH(DN
z<2$Y_1WaKBlRNh}n0#Pw{(7i4@p#@-*RDrB@S(8xr=z}Y(q+4!D7~o$P*5z=h4QiX
z1-=s3mtC;?X0_>%UWZv_UrJ)E8DENBTy^&3VXkP$zw;*}ghe3wPL>&|Bc81;%v8Lq
z@{34heu}G#SQfQ$M?HK;4Q}aSN}x@MEsiXv)>@>5gl4grq$uu$TV^qNGLq6l?d-`=
zJAbm6xJ)uy(dvo=#Sw^KtaAUnvoY7s6Y>?1cDnps5^!o8B>mk6qrWFl<NLlFm>C!{
zSl#hux`9*C4SZFV&<DMY1zm%736rL^6JPp5=XL1KNPhwa#TC3lwGuI2)~PV#O+B;Z
zG{D{EiT@zFbj53L>{Z%N5JP6crqr9{fy=mCsXf61iAV3D`!t!MS}v%PB%ny~!^#{I
zorVTPgXuzw_ep=lkRlBgBvASdD7?UlFsKMt^!t5F=dtkR5~|)Um8G~SpN1q~Ly!mh
ze4il2KBax&!-vwt6%Xv3+#s@jJro&yWYA0NH9C$)We4eZGzd+TsZ`2-=ExBiua97w
zc=g-IUpe{W+_|~uX6-phUeT^$p?|&X*{~5@1so7P+(BwfyZjgkv*`vo(MF%K=OcJg
zFE?5M3>#>rEFp%l<!9Le@Q7)9GxoW&_Q|=^_QkpLS18U<2>l^6!_*D{caW^iSSV^S
z(&Cdjh^#*q_ed-alX8GG4O|Z~*o>&(ya^5yHiAZh@*{e*f4aJf9b53axjWIbyJMTN
zFE2Zd!{`QZQUizy0E!yHzNNki$6f$RT%9RAB=ea4P;<S|AmJ8js%t=vo)olLjAnG&
zp60rwvmB1T@Y2Q0XG#~&&7He&@_cAT$?m!=Q+ZM?N0=!E@B<|Q>}9NI0B|CitQjGt
z(IN3a9DAv;K)~`m;a0XKzzkJ$VE!S=K1?~#yEqaEp7mf6;vyAj1i&6atgt&w#)v*b
zSZ68QqrgWHCRy|bD@wEC;uHzu#Y~(y=aL>K34A|K6OhDkN$8WCs9DSn9#)hqjtwSo
z)#3<tQ};NB3Br#X#Z-MGmO{}EF|N1(D{YJEc&E*^>H|s??o|#)FD`qIv`3)<B~U#&
zN_>Er2kqM7fc`W3>wO1-VH!}h8&H?P9;T($yfy*l$-cX$0R;+hVV2S1IAR%fAMz}u
zpcF!2!n<i*`%fbe#0WcD26<4p+|`W1IF*ai^9E7djgmH0GZScQTGQUx|KoHzt5NGj
zb2t9>e~{s}O;X*Uq@~NRqN5d3VRU~aCXvD76dvz8;B*2gGsc%mQf5N+p9H>y<^o6)
zG2ly5GWPMM<tJ85AWp)UiIpVd%T$osvOWY~J~edHsw=6jGzs4Csjt7+Nu%D#x75xE
zP-ZHCdUOToa*q;>fXmLWooJ_f=}?LS^<{vOG7&Z!$RRwKxdPN6a-cBU8AUA5v8<o<
zbA*A!m>4G!8A5&@LNRcrF#PeAT{rcc>ed9IM_|winDO!MjB>5)N34~N&=%`{ei-*<
zcPGcRavzaa4z#EaDUW{`<j|wZfcgWy5>Qei9^cP^I#bJavhBT?&#C?jU@pvfPP%Vk
zES{s?JV$w~1e`7CPZ*v26@}NTOJRx-`cR26tPhYr_1}A*W@>9%Oasef#dusQdQwRd
zdfpQxg0Ubk_5wYRf<_LQcrP$<ImW~fb<sA^+Vks5N`YJw(e-JTcA)$25Z#ABH#<Q0
zogL`<MAzRJWCD#4UoaX#sS>0&El35il)xurANCPRUXMUNd%(#EO#flpfe5jJK|UA{
z(h;)W0VDrl@T{zz5NUv>hud#|nDT{?0^eBM?lQgrW^8ZSZPlPqsCkg@p{pT%GT)3I
zU#zE2Uy8oCiJn^KYy?WX7}d51QL0=8|L8+gS)<Pe1ug|m3M7KLLbm-mT#`0!M6hgb
zPe|w(pi+Q>A7~A>8oOFwDA&r+CeSV3a9YxZ@rSevP=LfZlu?G)K2EgaT44B*tH@Hh
zNfn_2c4UK-$~Dx4i3=B_*M9YdRvGs*6^3rZxe8l)RIlHGg(|QDNC4DGx8c^I$*M(`
z2B@@%h`k6cdQbI?yB$}X*PeRqG5ZvBp=68Va+lm!mT?WWU;fhPT7WI*1T-zbNFc$!
zgyvh(e4lN><AveJJlRsBn)VrJYAW1``Q>WO_1xz2#;kqPW`g!SX~;Y6E6?$S0d4gL
z4>&=?2I&f>4M@#}X9@SeJBOr5@?3H|u)9fh2hI?}t+`7ZKtxVWc<o~mY5O!UPSxMw
zE<5C;6YUa$GjotI7F>7LzItrt$@Y~O%S@FT06|Q`Prs5(lUK}xQi`XTP43a!e)QfP
zYkz8oxsheihLVcbmVg<Ll{Ow}uzRc4J`s;yR6PJH8av8&XP7H5SC^KLV029ov{#_A
zgU3w|$hM!2>mRJv3j;g3O9AM4pdS@?seu*3Rvg^v?W1$Rq(`D@q#jg&cReWI;_bov
zk5uPa%9g~C+Rx2PsPHg8pk>epuI?=Z)#nN{X3v2xkIoVe8`2@VLZ^>*J9{<A&(UFh
z20_szogmbNK=q1jQLu-Cy_A>g`M5+DFZD~I+KD8b)5yO(V1~Me@4|lhD^&Ll3cgH1
z59-P{>637ue2apwP{6?Umk=WZ9Bs&A5<A|*(Sz>;%otG40oz7`ADg5jJz~n2<%UPb
z>@Zb_vl)Mqrx*GBgd@#XqvA)WJ+j1+GO;+u1S3a_m?9axBV2uhVsBIMYY5=Y1!h^o
zN;)3n;=(TZ>y-I21<zA(j*9JZo3x1mtk)p*oPs<ct8gQ6r6(CAlPO951_kd@z=Qt+
zVsInuo;NmXFmU#GHTuB1sMQ<MDzRD6qAr~xIiWB4F<edc;OarFp!Z$ShfQP%oaPOv
zDzloZ0g##ilxcv;CkTArBjx22S~r#f<eUJk)b8c+WROnawV#rulKO6_K+St5K&Xa1
zg1wI-Mv#`ED^l-j`u!=pp)MtxqXEcj-rIi{*2TY)<d0GzRYUhjK$U<iAgbbRVLUhB
zSFoyi)(t2}VOoP~QouIyrh46gEe7zEY%#3nvN#1wp<YVrI+Y>^vwm9ox{i9ufs_$W
zu?A8y*A=8`u)Fl@jfynXpt=}=^}ogT`-5+5*EP_}43x@o3cyaU4|#am5S&W`&aD8h
zWT8CEA}t@M`Q5a&^MMrr2nQp89-~WUXUx|`4wRHg9SQ)Eg))lhd|%`{<38+FpfR?p
z$St_b8G(tfarguzQEFGLtQ#kT<F>qsejceh2k896ePws|%WlsjwPyhOg{k&G^+8G`
zmh-J08Hvcc+Cv`Er8;Ejj4H%FEa1ZM-y?m%*}kp(zP!H0d?@RIM{%gs*^@(`=ewV=
zj*drir$9{*V?JziM<{AvjE{Qc>2r>ldAZh}jrAUps)1D*q$cnxkjMQ8Dl!<M4e57C
zH$>-v_W%3Dm09_;mn`YD&OVuu;khf6P4>N8f#36{wuF-aKO4AJ-W*a+6?91^G5IP5
zWRtmNvR@c8Pxkf;bIc4$xe~L&uTfcYTeubJ=H!<st}>t7rp)?n9Lx7fvyoLau!l5Q
z1sI8*`AK_!pKT5(=XTa;4USlDQDeH?hgZMX@+N(Ln*x%&xAo$i=!N_Pq~AA><Tb?e
zNEREO$K5=SP}L94;|+!94}4|&{K-E=#aoGOb0)uq)aX8;K0${`>W`f!=-a~+1l~v!
zq-8$1udCf9=Lz6u`-8^hKTi$xl%uRk{4UWZ+jmwWw_M|+)0KaGKh^L3pOY!c1{U@3
zyA+G3z@zvM-TZSRc#AR-p}G2fic|1W=ipJQI6Mb`L&X2&qtcS!yPwkEb;tc+3-cw^
z@+J`QPBLjHk>BKi8n^(x3G%y2?F740M_nH7_dlXFHY`s-^?ijsNaZi1X8SZ$v-j&}
zImJG7=+NBB3uo*L=Pq4>p4PtjqCI~R(Pv3}I(O>gh57SmE}ucE_O2ahgz8-Tc$fN5
zPYW&_kDy~bE;uZ=>V3r^pb+t=J_?fGpikZid>|O~Fv`hxAq%^C9V_OBzsuVb7K+0)
zj&Cb)e0z=K3&j-i(n#kxT=^5*Du1H9jk+g)k2+3EgyHu4&l`@nG+{ip2g~0@>^iO9
zc|@#HU>uygac$h3NQ@`OjfsQ-Um>KZ;|6&Rk!nwS1B?;pb1yJchS8C6&ls8hZ^JU~
zWPqRyW8yBnR*3S@ks!uu;W>rJdlqeye-#^mp|raRjt^t6>Kj`Lcw53SL8jwCgV8Jr
z&l7#i3=C4<t-+)y6a#;odhGM@@j(T{5_a>F2Gd5OV}e2w$VL+ismVa;De_@?&Fom6
zR43gT>13c*OM-HWD03{$3PPBhfjRV5#rq=u$4I&xhfxkVW=eq~uM#9s4~Ahf?6JRa
zEik(&YfqyVp%(IU`Lal|nb~4|WRb!IDoBg+@lLWClFI#-NbOh-W=g1|(qx<#BcPE(
zUEnUH!KbB=nTs?R2QjD7u_NI{exCx&u+sk2rOOxRXS+s>*kyxuYtK*uy9ES#X#J&g
z1pmDQPFGITK0af&zwkVK7a$Zu<RV`!ueG$~;J;vy_5gMP<^^m8>A9BoeK&fNB6&!V
zXvX-V=M@5@%}d+*b+rmh3ORMu;m+YhvW1JufN=xb6=^&5CJB=8276$iAHPWM(mne{
zb&*;=0NJ!VGS5DKxP6diOLiq;t471MYi?r+nwtnMdA39UH^G}Jq}Z4-Ao?AmnSB!h
zIdznw>|w)2->IXJkK`#5i=rd!m0%huT8xZJVgdh$#$AVtS#PKeQ&?D{$tsTbkVZdk
zEH(<2XjV#uI*KN!l&YebiR8$rKyid|PxKM0rDCc_teAvTNC_<zbL(Z<fHYZ(9YTwf
z{QqrA+mg4j`7lNGdC@~+7bh@2(%h5FH<-Y^eII0CKY+U>($ebVc*vJTdxJzRa_1+p
zts67=f85Bw014bMFVJhhB)5^`WOIXL-)oTXXE<`4{GB5=>0dyB&{(Ru3sj6eWoF2i
zZvnnPGt~zEUk^LsMN35sRGdJMd;T;tu}2^~@X1Bw%ip0wFfBXefK*JtT-4l<3zS3u
z|A*0J1eA``=ff1(6cDFm-%0k)VJ8^2OR&txY>$UFFoC_xGPItlyP{QdPSD|G#~TZ1
iho@&C<O$QPackO|uy$D~{OwDptTC%#y=s**m;M(+;a0N%

diff --git a/utils/__pycache__/preprocessing.cpython-39.pyc b/utils/__pycache__/preprocessing.cpython-39.pyc
deleted file mode 100644
index b68df308c35f5ef863048c99f4094963f4a58c53..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 14879
zcmc&*ZEPIJdET$v`{40-q)1VeWo7NSmQJN8OEFTHbyUmxu<1w?ZOI?n*ti~VmgJFl
zx5wE%QataqiRnTzY9~s8wnpowRMIvP5EN+8{wazUD1rj*p9V#K6#FATTA--<t7y?b
zLSyxL-`)EnB_k=^_Nbkmot>GTnRn)$=Y8f?X?QrJ;P-w1+l&7>uPFaS5B)zK56|K9
z{t<ytgj!W>x@%RnrrDYrrRlcL?+n|(U9To;rftUWl6I2crR)^$Mm1f_*qQj<kUhlj
zvUV2tM0NOTRA0^>5vEAKtJryw5^3B=MMezaJ|?na825c5Cq{4|7kM#?`-B)1`*7bc
z#>E8glj5^tQXF_!wGW6XaS$m};t_EO_k-e5aTxbU#Ak$s`yug|cpUdfg?dFPJaHHO
zROSn6IC8P%JJ;u3;drO6I@f(9nTwT0q#>4lzOsnKg_86fiD&g#S*3#UFM+#>>ntws
zml62Nswz~W1sI(vWm(6wCX5fXwi>9b`i>zIw^fSoB!r2W=9}9}TMx8z%3CvS!%qgr
zwkq^BL(T>U)gY2<8shp+3eOm!U)83R+p4Op&x%Z-ZzysjKyL#5iXzhJ)ljSEm)0Ds
zQnyOY>()}G>R3yzw9pw#xaDTesr%EGbo{2QTh)r^TkeuouJ|Rt;?}J-XUof2`1va(
z-zrJ2X9i;tRkYajt-9+|<(;~06E#<xrB<%I;*{4=ljF_cS<)cUKzhD#oBq8-(_cFN
zOsHn=smJeXcsJjA@%X3oE2P7W*IaB!x9oUcsLN(OG#2I-&P$R^XbPtc*@g+PQLXr)
zLA?(3s#6cmQlsJ2MIm9QFjz$ztk9H>*R1+>zFBX2jwqI-ENvBKY11AXNLi}XOVv=l
zen*iL_{zOsIPoebh<C#CE1tW0;#@q|-U;DsoG4eFQXLcNMANTSy%P=T#Eq@gmuDJV
z;b?F0iWsd%)6bz!Z$ARXIH+dTjAp7S+_UO9?uI5O@up1U*!j`&HE{7n6o>?vhL}1t
zzT&Haf=Q(bohHw=x`PRcX{h0D(!H(KPa!u=v^RbIC&+F1#ye`DVY;PSr3_D)*+xm0
zYF52eb7n2im*^NKpud8tM`@ggCKmmcs-@*w3lqY^+bXTSsETM-&RRTL#cr`M!Flc`
z(W$#XzJLFID|cktT9B^qlzm4WJrXLd#F4_ZbtKeLX(16FD7*CyNBX?3uQZ%;rBpq;
zf)yO<Xj+)W@K;M^C!C7&^=G;2zIye-GhybnQnl%vm(rC5Jv68>p+OA~Gri`q<fDK@
zC7(cG;qne4P?9MK%eZgo#*fWJU)+TFDCdA4p_$s%BOl=*P<E8t3Tp}#cg@!Vbz6m|
z&{?c63T;P+&~#x$8X<m$&WN5hrERnm^>fIdfMDy$YeJhurINdH9Y!t#xxO5zZ)#jF
zixip=?+3~i#W$dPv`HmMOk(=5*ee5KuU6_07yE2SuKtQ+c^JreeoR}WPc|$53e=5b
z)ha?jS<y&CKh@j~NAR?`OSAf(YRya9zwXjJXXSF2HB3T|AR*KnZbObz%vg0Rb$J-k
zf=O$=?u0tlFrp<ad$k@WmMRi4Gy2{x5r+Bb)0en0^JvR6sL4BkKrvHTdh~Y`*I`^4
zT!t!7;@yBO2lj;b@wOvNEG?3u7HBNP=O7J8bwGyn+%H4DZ9s+^%W$bHL(0`JL&|j+
zxk!fBS%#F7MGDFAU6!FvI}6M1;{QN)ZoRs-TTCn-`7`LCd=fz<AbEtINjCB*f`Y*U
zV%f-Jl**z>$e*S7GzBvhoS@(-3P_vw)>Jqe&7MdgK1t*vGDNB|mdGx7oWlD7c~E0$
z+OTTOBH}CC$_|v`Z5j`?b!>1vNFfz*`LhFRi607@B&8b;vq~!ZE@v#Lb|y|hv0Xio
zkD@>km&fYb<mZ9*Q94a>ZXb<guYGOR$BGr0H&EADgF?qWAq;MLqV>c;%OBEaTc5vh
zE=-iXa-|ZQmF2oC9ZAZxU~*^V=MWEvn)M28lcMC8!Zh;KO4UjW`=L{(WJw!RK?@D~
zNN7;5-TkzXV+EH-eNoJ;DlOaxHmMZ7z{NfNXG8>6gV2^quR%Ah>2iVaQ?w?QG3$jH
zC>6~04~#bHpcFJ%8kY)uv=BdM=oyzWML0?aj(WeH2ohY|RosWB6l`?Bk{CJtHs@`d
zNY6bWJqb;h6eG}dCTTjV&o}(Uwz{Es|G;sRp8B4XeR0#r6FtkTv|@h_T6-Na%&~my
zF9#$g+{&zV?(!UguSRnbh-AePz*~L=n@cq7qW#pb_*H0D(vKXa0$nvakiVm3J*T>K
z7AWj2!Lg`hy;)myBo(EXU9{&hNZk8<>u9Aeoa<=N;ubYBTIRf>AA!h19ch40yt(rl
z4|VjXV1$YFrX#li`J5_tW^CJ)8t|40Smis|b!z};rDdlz?7M!cdOX5eUYI~*Yo480
zu5193sWYrgP^wg`RJU5%v{gujT3LpPO;@gY@-t{cq*6%lh9g^UQ(hpAekN*=MX5PT
zy$FRLB^FZh3zVGTF|>yV#>h@aUGd}#lt-;Zy689_cFpi`f;K1bf^=(dNN4FvJksmP
z>5(Q=l7?n#DfOV5SI2QrsR!_v$DeuM#7?2zH|}d%?qf~QsFJkU9xXsS2d@oMogLt|
zt*SAEgEqkYh3$Yfb6eden+&+WV6;vS%&tbM;mBeK(au_p5^b?A83Y7zS)Q%2R~g^-
zX2Y!2(!Owd9%v`g0DSWqVup#b+pPOo__@Ja?U5J_SL#K6vwN&Y0gy#8&ctTH2k4VW
zt6xcEF*b4mu`s!ai4NdApeA&@=_<s};{ghRu&jmdLj{sKwFAXSsFqBq`j`*Hssolt
zt<x5Zfl0J}YhYkj+?pdQa@L}Vg&3x(E4O$+2$<p*wPw~TbwI4|Q~@HEt<9B6c_n^L
zg9+onIx3u{5{w0_;;~&6C&yhXL<_mIFkwX%No_{U>1Fil>V<-4=gMxa=GKc5X7=m@
zk%BJP-MZ(O>Y^k?(cfw~ULhfAOzaFdF)~&R7xs&!;R?-#$U@b_6bYGTYG_8i2~(YK
z?-o*a+3|~=m%oW>Jmv^en7KOejV515+JI<CgVROd^qz)NScaAhlvV7fw-xB#58qQr
zFsnMu3`3|ll}U(fl0-ttmF(Jg0_<Qn*@pGao#YK=i4afX-4vBN$08-9%K9D{l+Y~2
zQTKl6b>x91j4(*VcJS(hzsKLo`Du~bR^L(AbABe!U<8?K#w}Go=?{st$n0eOVU*Uk
zv6|4%A=3FOO}3=S(s#x1q|!G1Twsdaw)V9{?W8}llMj?2c^hl*b=c+x=IOSsD)q6G
z$}QE)Kc%SrcMYLGigF{{T3e}S5Lcr(VKM9nn>-EJRPT)cv;}cnkccIkuvE*yA(1g2
z+g6`6d%GCBQyJrGn3B8eY9)`%ojWpX9l56-VdcF?2hk+kN75zGyA165R-UX7p-d?^
zl{M|_+PZ-rsJ1qDj;9lJgEsS&97pauYN&%kh-MjCL}BJxvbHhB8>jyi&;Nl-X+2J}
zuGhzzXRB_xRP|0H(#gyKa=vx+)p}_WMgbO<*DRBDztpS(h(;j2uWzl_=IRzHeXGk>
zOk0~oeKawqG_J8p(JNzt36T}Yi|7$-uuE>W>Tc3nkKX|Q@n+Oo&RBD_?_!zLKL8?0
zs$<$}z^;cLfKu`4>%EUti)ll%fU2-7S`~phD@zqH1pK{Hwc2@woy3s<S&lb*_rJ)x
z%@o<fq>hB@rldW$FcxOxMioT`d#gcA4MXWL8NZMq^%OaVAk^v&Y?mvjmgo#JrQ~Du
za+HEg6c7>*^;)SRU!hnQ-vY3U)rBFNwFWPJY@DxA_N$aB(-RqgE7U9Ey8I$u*~z-I
zNqWQ{iYj956iox0%JW>v4T}BSc;OND$F`~Fuu($c=!Z4h8picQYDUXp%gkZJ)IK&7
zc`zT-dRD{6xrP#DQa_y^RX&7^Rnp%ggi6wJLH)E7xJ-i6?IZzAEa#N3u96X)x~<Bm
zeJCl4VX-EH6zwb`@qzxeQ|+{Gt|o&tQIm+JL<#^j6J&PM%K)I?#ID23Khs_Q`ICy*
z8d&`F%_07BmcwDte;FvTe{bi49D~v!k%d|u2}Y0_w)4U8PBzF>%|U)=7;sf3h{`72
zzXaLtyCh;D4xw^B>d&{66T>hIw4D(#f?7v~v6GJo(_l1Gsz@_~(G936`BX59FQ69s
zLVGkALY<@0C=j_CkHWFOQ5Xk-$^8#Tm-qHQQho#dhw4)SfX(&qt$!pY+GD}cIehu8
z>+dP8F(^arCVD3JZyRm>b*28eq4;CB)UCgdl&8N>07k3IHI!KY@S0NBp6V|(iBd+l
zl(v=ngi<+woW2Alysxjs?88g&Jm`$TciZV;pO{$FZfWud-5J4S!P^!H=XX>7L@>Oq
z(ipeLX@&#|e}91O4<QWhV3>0r9Ly;url3wS0>(1S(x3DX1TZYbBa=|F@iv6HixEuJ
z5AfTu-dk}f%7G22Hv>rqn)h-491O;|28`_bhyGNMMGX_!{&e|Q{^T|`s341(x6z&$
z=<|yYlsZgKVFVKoZgDX2koM@_R|nb?_vn_sJ{`5d2Sx86h}yPITkv733EU#ciX}pD
zpJ`p<jdF9v#Wv8WG#p}dScUfNCdQxfR@p&Y7tTy`K`Z`UJQLwA0@n9(tY97Sd}VQ_
z>}^Q0&RNf510z*N-?*nv-&2D-dYBgYCJY%z+UfNsk&a*z?IcC9SKKk}<eA8B3$+Vp
zLha%iJ8_k)HapcZ5bPlsWK6Js(b<}B<p~W7*gqY;F9}e!46+CBVgtA@&*6E`zRc<k
zFT8^Yuq%&6yYhxA!3Mnsajt_^!jNgL#M@tLzXiUGm=q8e*KiBfa-@!#U14>ces<ZZ
zL+8uW|Aq|mEd;G6Nky}fQ|u0J-}c-o&>H71#QpZra2m!?EfrKLsWLVenbILMYM?9B
zjV{C>o|GZRIA}+p_M1we!F)CXHG&zvHs8^?AH3a!s&_|ai!ORk-H~r1$m6?FM~-}Q
zN4hxU(`cG((f0!ix}(d@8S6CXo(AJ;BArfK&mKR{M)+|IAkT=_sn^fEGCw!}!mKq9
z3oV*SObu+7JPW4;8-Na?i&2Qgv`U{v#e8^!H__yuu@)jIQY+P)02f>MN=d?C;j^C=
z2OuS`+ncfGE?8&g&smq}FJ7ZK!zQ$ckPp{(1Q>-ZYQ{~`7b7O04ut6Zr|ce&ZDyhn
zh>zfMh~d<T+Rf8rns5|&2YMgXM>DI#U_9^(UgzzD>a#k1o3XC0IQ3~X1DL7~bOo?Q
zAHi&<wh6~t1fpD<DLf+ckV>ezUTBaJ3^MCF5T+*`5*8yLohGiiDQSI&!!N&j`Re)N
zrMdaJOJ^>IsVLdql#7&~h~@}ir2&*cCE&!2=?H)<vfdgIsu~_33Bu7=>x%?JFA~P(
z^8_43RUI4v5d#pW9k4Tw1lDJ^EJ9~Qkw#$cF~kb{!(@#8BQ$n_Ui%a@5rj!L(V?C)
zOmCbegWb-?iSsV8HAz?k3p4;pbeGIS`3_3jBmK)5`RvjD1TJb1VK#LZdYB-rxn-wo
zTd}>0?~pQ!Gcc)ZXX2SQ->mgfE!?UREXz3W9C1vc0jj7LEgN1yteN&bVDbI{?e%^g
zfnplKh4hCLF`&ha2C_w+#`zfiWq>4!m><KH)ehkerDQ<C<kV@T7>H|lOS!UIe=HZL
z<+;Ru<gxCX8p><WAZL!g#<lEMp9U7?uiYm~$JFK5(W){N8`M7h$b2G0`YbN*`_OoV
zLSy`yB!UyPe-ilfabE!vMGW|ph{qoOO!<jb6Nr@XXJR$U_%j`(cT%5#KUW6!T6HzO
zlOdD*mj2dr?F`BdeN$}@0fnXmP^POupHoUO1Z}&s9<;{0=^#ph39~>;ha!~J_lD47
zHo{{;R*ayQ;r1|Md5-1$oIgTXNQ{axGM_;K<Y6!aiweUZTithCzpd_!6TSpa9mnxK
z?(JbNmHR16<s#HYtPt?*_u|&<Z;x<oBR^$rBdCqafL7+(M$oE>fZ9a$YjP~^<EK$S
z)i%=3wVuYfPWB{$@i60Y>D+;~c%1h0IOQP`>~HiZjCMXo#U!^*Et7=h2Xc+ZvVvtm
z{qS~Z)OHSt13><e9`{d0|9%8aJ{2T_(I79TfUbvQtbFj6w)MjpD?if3<l6d9RPQZq
z{YSWVV9!wfx~I^n?Wtdvr~B$Zi28Hg`t>bvNc89T>ettS{0Z{|C!WTo_=f@<Wf3L}
zh69i>L1sGziz2o#c;!5TIYyS-^RVw8vT_2`f0Sk?LP$2q2V+4dLf`#3dOZM+*1bRJ
zUrXzO$N*R!YQ6VKqzz#T-oJM3Y`h)JSl)`;tb&TDdSC{?@el{i`=rO)?Adcyq9^`D
zm(IBm!PX9nwXA*|E342iJ@6`P^ypI1pn!O3=^>ae<XSIb%e1gRLV@%5gakVQG6IJJ
zKy^6Zu`0F2Qndsgg7);f)08d_jEI*2brK_0#vESj6x9`%g8GkKN0v(sDhT4(k#$Zg
zRZ$YhV%R3V)*CN3OW3U`GguL41E=dzxn2VnD?kpY0uV>Kb+-lvtQwt8fC{5ZtR*n-
zUBVhWJvKV8HTmXet+U`z>HLcGU2$Ju!KT!D?d_ME05Nj{9L+BhWU#K_^UY|yFErr+
z!(e2A&R?RE)_HI|Wp2d6N~P+0ZewL@);eQxh1LS`>h0FG7kI#cy}F$TxS(#M>N186
zxXp#93Aev9hD1SmE;%Atourxr_Xy!u-Q_LdBd037*2$=9>l_QG;%{-29dg=<W(k3t
zdDtL}uDfP!oSb>8b?ucB*GeBig&2gNT#{TTOUwhE#Y4=;@sVtQ`qrFm{rm&QMwU=F
z=q-{h!8GnGO*~@0yHabNj{7bu9^e)A9r-&W%!OAf%PYsxy9O$>Rzdc`1E&jXThGVk
zk5+1hzM0&q0a&`P9c6d94hdl@_RsXziFqK@<Iylu3(7#h9%#FGdhq-s${nI?N_469
z!h!_(hyDROqkf?5t{A917oa|S4p=~1OVn-Hk7x?5KAP?9@gQHK#riA)+axv-1S2p$
zoVzHPqJXSEJKgnqiB8GXw?nlZSwdtp_MKd9;jwU7l1G~S6$-vgK^OPRuh1ibK)Fr9
zyA&|WEg?o{b~GRDBqqFvl?S&7oPL1S1JI4^Lq5fh2*|WAOLdRVzr%D5zGwVMj$h;i
z6b?0-^|Bu=_UQ1ASV()6D~x<F;u?R6+Q1<D2E}et@E!s<cR^PcAxOt#kX+a&e~n(g
zM!|~|%u%)}w?T6lD0>|y&{-G<vH}kiS9+2W^ZS&O4x!{%Dd5h32{CvUc9t8Tg)oZt
zSQtH&ZELl9BqTmbOi`20lF`5>U)Syn(_QR(6aw_VgZc@v_9NB2n#zr<;5C6xO?5vw
z&5ZgaISSFGbz=tM)^RYPoX+o<5q(gqu7N*|Yrs7PG)y>`>d&Z;_i!#S?7)+@pHVH`
zl+^wA<?o`cW#S?_Kf<wuYk^=DZwLLk$<9Y!{wBCpoW~uY^El#bZWuVk0eYoF9A?5&
zoC4OUo07gkxya+JmzKGqqg=8tWr$N!eJR-+3eq$j*YwH_iwsy)T@1lt;9vR3f8jmt
zhK9PbeYr+B1=y$ChCCc_2<K&R4D~~xmK^xC9MbY}+J~LA^$&wo0Gu2Q0eK8Bo9$6w
z6C<ceiP%)&haA`{)C0WurpUL)d>nP5meDohSD%L$oe`LL9)l}E5;^xp3>mn*-EYR@
zH_s7EI#A2aJ$d)XdB=CZA+|Ia14fx_{ap{EL^e8a+R@n(9m#fqN3^F7oR3BYVz(D4
zVL0>=IdGwOyuRmM?~r}q)kBx!K&}gC1|BbV9%B-ZdvdQppCG+FZE-^=YF&<(dgK-~
zN4mUJZOz6cNW?oZuR)xFSB7!!Ka|yAfHpt`X_)V{4ru;+q?DPXdYX>kNy<K*J;Rw-
zDBJ9ow+dJ2ZEYDV0j@dlv%EdPVHI>qCu#Bx3g|HBj>#@#tUTEfFdSiJNK8wt3V(z0
zlBdI+h`y6=Q(R>|w~J)@kkky(962m_sFM5eGJwaScE5GFr#AaIya!WXpK6c6)g7V^
zbbBm+j~;JR@J$N3>XK=)Z#=9Adt^dRODvP85n-83vrKSW&@Ypl3d;gIYPT%pw@~m-
zV%PY~9i&Ek6SWbPX_FFYQ`yzjZw)q8qrujUkG1=In)<0tA8vAQ05J((+}p12P`k27
z&w-oBfVcQRZIk?iy)F9g;7IkR<(`rHbc8B@pV|<MkGZi24AUPEHmWB+23wH(H>eT{
z_O$YUQJ#<Q9lbyM|CQ$-?rqUu3=UmSo)32-xPn@|aRj_lv$Rsl{k4zTFR09NcFrgg
z>;=CIQL&2n8+@IK{ZPHeZosmQdRynfsPCoJImJ45?AZL7OXsaib62i_Ubim4Vl7-o
z^m!t^=g(fgv~cnK)$_>J+V_AuLUq1%s#E(<UKf0H9;2}%hznoyX06BP0<_Xj(!;Cr
zcj&Q8<4T67j7M`FaLGP=C+(5J=hE(kMSHNssa+XP?JjX@(N5E@7g5E7g+I5e@aIaq
zYWL(v)Ns<_=x(q5JdJtLvEn2qSN=AF8#HssI#GsGaD>yels0CLCB_qD28D!;Q!2&j
z<R99LkSnz6Az)J4(#F8B9t3ZjQx9t68rL|mIBDyn#d#K&cLCobw>duH2kUwpdooV*
zRo~c2z=akkOynC7XgK%G!Kp~!F$06x*YzjC0VxLl*qMh;^Iz&`fR-_bmo=PGCE6xx
zNCJCmLLrV5HLZ|)(wk;G)lRoF?V)xSoM;j?r>SO+WticGKLHS-zNUD8j{oM9>BMoa
z3ZyxyfQkL<Ac1l?`KPm4cKL1uW+!F6fl`DiBNrw^B-xW7#TYh4oEBKL6y?A@$*0Lt
z&KZ$@U{0JjgG<bi2Z9(vEh8umbc{4|QOeFo^wLp`YqaFp!u$?BVw9EE?3JsR7iK%>
zPO(Q0&D8GM7LJ<0y5OKjI>+&Ua^O+tG(79njMe(mi||c>=?=q~Jn_8d^0I^fI71u{
z&T?1{@X1y8KL_-DbGr6CdBBhX%`mU)^@D@9S9djRqk_{fa@wiEUB?Ii02iXZ11WG|
z(sF7IGWX%@_n<~|o1eH$F7Vwmr^*t2`3M*USihaFO}CCR1Hi6Ad=gc6t*TpJ2EP`8
zTF-Lm|CD$$g)|>P^;ziTlf!m7Iqj4{$?;)R&uOQSkIY`OUZeHv6>*$nrx?H%#UlRa
zle-BA!fsa?O|nf(gJq9(0bws~G(N~H(x?;(9NH#oDOQA?jm-ThgFVFXG<pctqMhzm
zW+xZn+K4aMBby~zhtXe*U488d{$EnXUCFyNe3+*4EcB3c#WU!iLMJcK04ent&Ug$p
zk5eAf3{T*qlN+*x$-y?&7o!yUM<|PJ<z>1ZmgE*>Pc*iOoVX7DWricC$cH_0eg8Z%
zgvN5!U8HQ}12scFnv3x2nyJ+B|FGD(G7{2PsIqaiwd*N06MK5HW2Rijd-)+{f^&dF
zu4Q%thkK1JS*ADiKeia|N9gkuJw8f-MZrZ17APQ>OLlVNb0s#)S?h7v`i8CxqQlg)
vHCHsN&S_eg9#u?=l!m82q`^v!r4FRVQ~OeB{2fZAGoz_O>Wx$}b>+VSt7f@x

diff --git a/utils/__pycache__/queries.cpython-310.pyc b/utils/__pycache__/queries.cpython-310.pyc
deleted file mode 100644
index eb47eaa92184637f15d4825f0927a31059d9e014..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 10413
zcmeHN&vP3`cAgmwei8&Jiljh_rbd(~5fVs>lGm2K@@8dez1hmju~$wQPbq~Opc~+j
zgBh@AK#Itql9V<%MHN?a%x)FAx8%70MowFm%3&@!Y%V(FA8=LH`CiWe1VLG~iBpxz
zmO)NW_v`N0uV26J{@w$;$;pC(&%fOHkNW(5Mfo)qMn5SOzK_TMJ0whDs-u{yd~0S(
zzI9WV@3fhg?~Iv|@2r`X@0^*#TXXX5f>}_BN6MLK7tNx~>&|4mWR_%Jb*9WImS&ls
zD&{oHvK-zQ*abGhia%A&88*pEpj>2AY#Q%brao0FGp`_*@^M9traOYOTGOcs-m1A)
zn}e3#?|K1cvR&bL9qu-CYMy-ZcxUiD!s9EdqJ$}??r9>+w7%|UP)eaR$@IR?(k->0
z4po-kONqBbWmgF`mSNeJ#&Rqls<z4sFZF&V%&>_Cg%$B=d%C!nlqac-C%d5Z)9zjL
zB|c5!!%Cs{4+@+5d5U4q;CF$|yiD~okRi*`{oE;s5WU%S%;{J?fdrI7!scn4(^1&P
zFdga(%7H36l27ib5>wd41?5;{vuy4}J5HTQ$uJJ;EAzbvf8=xH@gM%kuv}(XBCs0{
zHv&6wxNiiWvESw5$ao|?-xs`TG(BNh-CzeV$M%CL-L!=t2&httbjR|8S6|^1DNzb6
zqfFfNm8?!N&#vgSLf{x>eD3fD7&vy@4x&ucb^<P<Y@7SO)#AO$zzZyAU2wna1b&pZ
z1K##~St;22D_^QBQCj9-ku_IRViGS=qGXC>Nw@3+?ndgym#RU79(rOATpaI^1`)jD
zSPdSf{f=V?m3*W%geX$yStMpY7W)&=k5V>!9%bXQFBYga)o_DICwu!xMO+2t%kOML
zhr-|V1Kam@Hy?YAZkxM-zsdN)Ce&%UAKLZJZeTn9Ce7aF{zm60nj{|nN5X6W2~<lH
z@NXf>X&E)A>S{*IX^ZN#I;(1bkyDpI(benPtXfPVmtOjfu8C`C-XP@;KD2prc%*@U
zh)ke_*us~v<akmqw7)I&)j&PgLiI%LYugIbbwyWRq*NH!Q)Nr>7Y|hxJ401I$3wJA
zx_3}8@S&yex_hp7=o&5VGA^nHR?RUE?O?~KTQDPG1>Bd7kGwAEsC4-O7sjU1u-u!0
zA-b*+?AX4s7VKC7b>{|3f#(@@yG8cq?^uFYB#+)SvjS@~9;-$J^z@C+o)c-J{&6K0
z>9oJ2Ovi5QIULsKSq#+dj@RbQ7LnSFCPw<orHYdkC?=X}@t`(RUj@-88<@=$)xuYL
zYWhF*l(<bzbDvm_71*v9Wt)yCY-Tm+6d8Ork)e)K>Hn0)siGW1=}@o|s7%8Gq!wry
zus569yz&AH20aZ*M$)m%N9Z|OkA2GW;E+Mf0#RBDoA`y=&j$K#I?S@-OSPYCDg8X2
z0?w<6AQP%D)PB)c`jdD{piSYK4zj!1P}|J~c}icXZ1Sb{SotFLryuk$&`GqL3ok4v
zyM^P46FNC+`=7F@A+?xL9X8#caSOrZX$_lNz)box;Y>Ja^_O9hfmi>ctB1vKhH}I9
z1y;g21`YIQd)aVy7xE!Lx2?GO*xW#ayhQt48RqB5`V-Gf=zl5cUmodyDd}HE|3Q8s
zm_EJ`%!G5h7uiKR|6XdxvnP=IL_MAhW{&3>{(?)#Wp)Ygh26!I6#mrX%dE`iaaLYt
zD!u!237`3I$Y(y`Q~rj0$_bx^Z^&mMoI74RQI6lj-6E}YP}<YJP(={V56ZFi4oY$D
z{-<Yp!<Ns~|NWWXu<^6C=v^LAUNiPD!-bJr*u|hE$NuG5@A0vJb*A_D*uO&W`EahO
zg_8#g=J{{oWL$=IExPNJXA3{maQ|PHyX`VtWS4PI7hh`qrC>Q+VxYW8_1_AoS?ZF~
zpAO$*OW4V8vA1NpEYm9sN=xrAhKmF450r2rEGMICg6Oi0QI|0)@?SgZ6^wf2zcK1^
zxIEyFQRl)E=BbINVJVzjP@ZXhqkq+%39f`G)Lnd$iYeigoXu2dgjXRY+M;hc>AR0|
zS@vD&UrYLylfIw~`>ur7WZ&`wCD4wI@B%Y1#yp*G$QkS^bYrlU_{0krX~(18Rpe6a
z+Ft=NonXhcGT4JFpj^k^yB_9tugP_|F3&xUt+E?fiQJHXIb2BiuSotYL;lM{{>vl$
zZzgic*3|RNc&o$lR?m(XUL|hfHA-id{tcArTpyHg4(mZh{o1fTE?2@E;mvR@th97F
z<6-GUy{tUb`#1VG`)mD5m_NRD+VTV^f2^fY&INoc<N6xb+#1p6$b8Y~_IMvzKHH`m
z7BF%pycOOKt1X>IMynKC!!5Lq-1QKvgfnp!XT>_x<@#O^uZ7D)S}LKT7J6AH?O^@Z
z!%Vo2U6KhWq<u~7e~LS;5>5dF-x}UZ*;xhU+doeyw_`P&enY*sdn3#<ovpvr*v9OL
z_5!NGeK=CH(ZAKd-LDRG8g9fh3^UC<z|CgwW<i3D_dAxb+6lJx-;Zg62VLRD1yRNo
z-<y|scL0F_T~_74m_VgO!M!;`!RH#hB4qYTBNnwoZZMC~P{ZrG0q_v_dDW<QLD=Dq
zJt7$7+SrZ2_FO+kNq|P&^}B)_fL6kg(9vKvGh8n)e8=vfoAA1=ovRh?6+u98hX(tu
z<9Qw11+<E5<KBRqT);Qteswq!+Qeu`IsyXlxRQw`9@znqkLUgYKQc29CGa;hKa}V%
zQnw@Zp_!%9BP$@EtKm7_w(FB0CgHmva31ML+!FS`0~p!=j6~o8K)3vcZATfq<$3^o
zYLwwZc)}MXdry0|9_2p~aOD~SXB7>|on{PogI?kR+KBHWF>~%t?QjRqfeD7T<(fI5
z%Z}yR4OvwO%-`#!SN-a$A7$eCMLDa{=wgCLAHO0P_gO_Xi?J9r8?g3yPha&{{fZ_C
z*u`qR_^E{LY^b7+ZS+cTlK|ii4=d+2jb08b6cD)YT^=+iHE!}?2lCj#kzt`5rm9=d
zZT}Sm_sUJNgjbaA2pe97=60fVkHguDv<{2%kAZ(bf?4tyhw8yTYe(5+lqlr@{$@My
zJYXe}zQ>O+$EM94Mt%#}O-N&vuCX5#$S#xoB+RQLJgmPD+la~y&jm^kehm2j{Aq_b
zY|D8B91b?{?XWU#!(%+kFfQNuKv=F1M9ian+uGx@2vE+^yjzE72PJ^(M5pUEg03_l
zahX)PD5px*9YI?mn#79Kpp;td6Q(U8tRv#mHsPbe73g48s{@$NqBPV@zFQLm2bj3t
zhF?|L6jzAz8WJ;Gx9j9lM1^>vYtV?PH$7+~)5aorESkbHNP%i?3t~JMZ&Sxy2Lp*3
zYl;u3sMD&6W#UWzm|1Fe9Y<=eMh=NNb*k2wFzJ#bAt!;=H%o4fI1I~a$zHqzUXj+S
zM_SV|b4{xzZCU1G<JKI|ogp1LNjI}-OAMp5-1TM}W!5xP9+E6tQJtAa2IV~J;@M;7
z$eMZ32L#GWZgLvbl`@gmZb#Z7p7tT$cr1E@ZdqnAR!yv}NZZ~XccV(=M#(@7<89AJ
zL`m@x^ic|#vRZ~8TYwjv(~7FD!D~fMzTwF(kzb1^qt0qYt)S+#Sxr+*D8u^#?KHoX
zDy!O8(|DG&H|3V~WpG^9#?z8kQj6LQ_>vc0RL9E&)K6<U8h4n}wd?8(M%2E_Y1(hf
zDP6s;u8fqYwP|?f)9MX%2DwH2p}kgscaA^IvY;9%LtTXbKCcM@oiynDAAIPDqp*~8
z82<(triKdn=<wFz#LXk$PX%h2f`g@p`mqLxj?fe?;&h-Nr^7Tkh6n-SnnhSi6Kjl0
z6fDYJR$2%nC3Sh!y>nW_G9z^b)P0Ds6x>RJUDPMcHp$;Fe%!mV69gUq{msPpUdI;Q
zXn1WDgWtoZ`k<G)L8kb@s^3d@gXa4Ec;jT6ylpvL-h<+Mm{$uAKA(Tz)TshNxj23x
zPu5CCkWvI`BGLu7nD|U)cX6IksK#P`B_|rx#-OB0wJF5)q8tu~4h}tVjL#${1&Gwv
zn0bn4=<~05{COm^uz(V*tB940AF>JAra@~m_|UCIWw}0Iz?d1F?Jx(YUJFzLMdYxY
zXfIU2o0zUAbajlLPUzYg9d5nkL#sN<C!5gqF?uecr^o2|gq|6r7ZQ5*l+Gsld4S8;
z%^{|rpF>PPKZlrpehxAH{2XHX`8mY&^K*#l=jV_Qb74NQQR}Y(BYwggxC{*n?isdg
z_!Mih8~`S|1L;l%#vqtu>{z~GGqkl)hcG9iO+;~x!F>Q2WC3pBMl}3xV+U{v;VF6c
z0e>MvhcOV+lhDWmJ`+Y8;UGbGy5HFG9Nd)#eGt1sj0~Z%buh+2f>Yq=00C7`!aBFx
zuA?(aMx!JHKn}uuQfy;wYyG{-hVlCVXa<m?PnZrM(Sw8@iFm-BBOpiD$9x13qvz2b
zj*$kCE|wi-imyt>BpE|T;gi4^{B$%qv>XD#JWsT#7R+%s0*?asRfnNIx@D@y5X)3Y
zbkT%dfNmScM-&XMlGt87uxu9^7y+{=#W&HZ2xv~iNnWG48B%Zy3L4JJs2>?!IZ?W@
zC>DsQA+3>*k)d7*nZ3GIw-L~lbF`pOVowK^QRk$i2Ji(fKX_TRpq6W_eWz+{-GRP8
zJOUHod7ui3vPr5n3W@XQ4X4YfJ0jLiT2~1mWN<vzBaK2WM<yVRk8nDGCV+NgN|h?G
zqBxzDFI5da8Hb0fu|R85=*p&)ZjCsi6Mdd+uNn|IuBaN@Pm&IsAjWI)1SD#KXT$g`
zo(q8<3b4u`J7zAI8(Om}u?Io$-*e%Hz!9J+_#R<DD1q4)RS}%>p?F$IPg-3dx}#P#
zT#UALM6O7chM;W-8^t2QtwDg-g)0+p1FUOe0`YDnOpIac6~l)2W4Ywa0G&&}h~mva
zv9MbHyb9j#dfj#{aRem_Lt@T^Rfl^Whi^zVJq1K{*;x}zd@0NUHjL>dfE?omyyZdC
z;f{FCN>b%{Q~d3Xkfe$YacCl6W05#r;xAS(A{;49a&2o1)_M0e*7-)u{CYEc_;7V(
z5E3mJD6zUR0S5NBjc(G(N;+?jwW}HrVVz^d&CwpQb#0^7DD_N}vo=ezy}@p!)qE7k
z;*CFsn<$LOFhw#_@<>Kc9qB?R)_L9<&sxO5C}VM9l~Rn?TIP>n)UZFV>q}YM_H)kG
z>5NSDfDJ`vy5r$%k;5Gl>&U=43D@6l%9A>vjYiiHHW=#;ZwRZ6FG9M+HWtn9$>WBY
zC1>w|Adhi72?fN~Z)|e|4*wn<<%gD#o3`0S3<EOI9Vt&t>@J5HJKX7DW0S_}91azI
zw}T7C56Db+yhEz+yPb|F0)y{&aq89aUCrq&iEW--t{bNI<cUOdrndbAg>BIWcoO?a
zOzq(|s1NaeX#EVB>UZ(RenxlwpW$0fgl`<@?wzfB@xjDdmBIQzxIZK`EPV4QKzet-
zYXOeXWpaRH67bk|ZENqechz`IYi6VW-8&n1wl?nE+qnPU+xOnRPlu`qJOFJEbz8BW
z;K;Mw7L|u%0UVI4EOt3RrQ@3KItRAwX}9b6Vhi}F|G$nmzJ(jFg&VP{eC@*acDycp
za);Mw(QEhKdH3$t)*Y<Q|1~;!hJ%~%(i`9jnh^Ss%?O_ix{Lypq|LVo<#z?4#P*Ck
zi|^5?cIQ39HdVk#kI}I=h+?~~*Vyp?7{kDdPO%2v(7z>!A&aC?@f%Rg8P<~R8H_T3
zG2@&@*?x;-B>MV)h+`^x9H#yWp%;q%m>COSskp3C5W7U%qr{<P2ylc;MURpWk|+-V
zg<mSc<%>$g?2jp~J{FmxU$4QtsSO+Xzea!m1`=H@mne98`loB-l$<8|Xo@c>s(6WY
z%FiJ{+--M`r0n=z43M1Viyu%uu@yh0<P%D^Dft|UnU6uG=ZH8WUqpD#vFjW7HVi*$
zN})+qN=O1QkRy0c@jB6^<QgUP3mF;WlyNjE)yP!o_fbOr-X`~;?Xj-Izem4U@`;pF
batOZ-{)!06Q8*6a;wy!s^5?lL#mWBy$($F%

diff --git a/utils/__pycache__/queries.cpython-39.pyc b/utils/__pycache__/queries.cpython-39.pyc
deleted file mode 100644
index 01db4841ccd562f77608c750a828f537a8c8f6b8..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1893
zcmbVN&2L*p5Z~SR;b+HglQaaSMC7Sz0gDp1h#o*u)zUy)36hGmDz#Kao40Fw?f2pC
zuH(jf@04CE;Ea%xWB&^NfgU*TfeT!E;liy4VD>p}(+VzZH1p=Svokw8o|(`_qlRGp
z{Kuo6PbvugX2J1A09*l53&0p+_z`L&MwkO%%-k*1^dc8jFIE)<ALts8`U_a@Fgzgg
z1S5r;)&V(yPBXV~nb*aImt*D)9NEiJALWGktkNYcVAUK8%xaI_!q0s+vxHb3hzwl$
z^awXB2C6Kf!b=(0T0c*%A9f-qKO%PWCx^|J{^r>HW2f*Th6?kF;FX6Qtl8|;(*Zt&
z2rP#P|MeDjD)(}C3GHI}tM!vEPcb{SgboN>U~e6g1Lv@e3vLM@JByRGJBo+vn>Rz6
zvXDwGqL_zT=$NZeXW@epmwVwgnJFcCC+uW0q$AyfPb`!+UPnl!pFtv#abpVN->?ia
z4!AV_^yo8e)eg%<(=}e{(fEqTJOT%?NQ5?iN5q;-Q%Sf|w96+AooO0>B)J;JS{YAh
zo~TKs4D7Qr&+!@KmGU!NA5BNvkDRfrZZGq?Vwa}|uRh12y>y((0eFeCvAsm{VN4@#
zJT;7kZdMJ6q_m|W=YX}V)39%4$~b~OG?gh<a?xs?DAmSIX~Lf%`7Z1{ztn;pO4U+Y
zsI1>w&!SPnQ>|Kz@3tVFH2qxcv_@LQs%2j<xLO_VnTGYDzLZ(=Eu_}wNWBLPkpTNR
zz%KDYI*n)XBA{jL!p>Q|;MB42EE5^R$;cYLSoYlkP+8g=pwXz08P3sS*$|IOfdwjv
zh7!ne-|c&ccpcsUr2u4c4u!vj`hMZUi3)@Ae}r;4wT|}jAv8j{>%Xz<>i|m3V-;(=
zkB?9=a4dQU6_ul%>VN11(EG31+je?oUp>;lvVUD)%PV=1SMyre%}MU8p$BJ=@37h$
z($yTXnYW=qfg1bdyRewiwSyTr58r<M(81^~w4{q%v=4GRe-V9;)s+hfmw)$wvlGxN
zCnrg*YU_WL4$^F#hFzXAF3*Qhx^Xxbx)<(H=p{)tSGDS1HiAviq<ojluoXr$eP4%i
zl!m$|RQQ4JQEhE`2uNpHxFfo@<5Z7I-mFz8E3I=-LR8b%1|MIYcIG<3k1t&ObTv}D
z(hk2SHhzy&2;{Y5uqr$Ewt&jn*hM+IUoYUNW@rihKpxoHh)Wk`5kDb@tX?qqVTq<}
z1&mE?%{F?JCBw@f0{#;SO;)CF-?k54U5m4b#_BQ%FHZiiSZwi5FFx~9vi&7(9P%8n
z^H#7uV(`}n-!b^E_#OT$Oy;I(Iu_K#T^$Z$L&iHB&4wXE)(#|OX&cgJNIEqBC>79(
zZF1|L@poxF;;K!qUN;_<lI|HNhT-96L*V{UM1z<cmyMEPd-lf7>o>mGy1jkto6Xys
z*EYXu)=j0CC7cP_t|U~X+pJ?|jzwbJJx*oY*@J!3p@zh??RMyPQu1*drlysFj}7j;
zk$rgy9vcFwWnjpgcU(MQPJ~4g;;^K7GD{Y4d2*vUoeL#6=IUh@O|X_^Y!vg$<+!m`
H60H0KNH*8r


From b2956e921c6af62355c5cff434735cd53b42a358 Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Mon, 28 Nov 2022 22:51:54 -0500
Subject: [PATCH 45/47] update for tests

---
 cleanBib.ipynb         | 462 ++++-------------------------------------
 utils/preprocessing.py |  33 +--
 utils/queries.py       | 219 ++++++++++++++++---
 3 files changed, 245 insertions(+), 469 deletions(-)

diff --git a/cleanBib.ipynb b/cleanBib.ipynb
index 60d6ac3..1d1473e 100644
--- a/cleanBib.ipynb
+++ b/cleanBib.ipynb
@@ -43,19 +43,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {
     "kernel": "Python 3"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "No optional .tex file found.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import glob\n",
     "from habanero import Crossref\n",
@@ -66,11 +58,16 @@
     "sys.path.insert(1, f'{wd.absolute()}/utils')\n",
     "from preprocessing import *\n",
     "from ethnicolr import pred_fl_reg_name\n",
+    "os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\"\n",
     "import tensorflow as tf\n",
     "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)\n",
+    "import warnings\n",
+    "warnings.simplefilter(action='ignore', category=FutureWarning)\n",
     "\n",
     "cr = Crossref()\n",
-    "homedir = '/Users/stisoj/Documents/dev/cleanBib/tests/immaculate/'\n",
+    "homedir = '/home/jovyan/'\n",
     "bib_files = glob.glob(homedir + '*.bib')\n",
     "paper_aux_file = glob.glob(homedir + '*.aux')\n",
     "paper_bib_file = 'library_paper.bib'\n",
@@ -112,40 +109,31 @@
     "checkingPublishedArticle = True\n",
     "```\n",
     "\n",
-    "Then, run the code block below. (click to select the block and then press Ctrl+Enter; or click the block and press the Run button in the top menubar)"
+    "Then, run the code block below. (click to select the block and then press Ctrl+Enter; or click the block and press the Run button in the top menubar)\n",
+    "\n",
+    "__NOTE__: Please edit your .bib file using information printed by the code and provided in cleanedBib.csv. Edit directly within the Binder environment by clicking the .bib file (as shown below), making modifications, and saving the file (as shown below).\n",
+    "\n",
+    "![open button](img/openBib.png)\n",
+    "\n",
+    "![save button](img/saveBib.png)\n",
+    "\n",
+    "Common issues include:\n",
+    "\n",
+    "* Bibliography entry did not include a last author because the author list was truncated by \"and Others\" or \"et al.\"\n",
+    "* Some older journals articles only provide first initial and not full first names, in which case you will need to go digging via Google to identify that person.\n",
+    "* In rare cases where the author cannot be identified even after searching by hand, replace the first name with \"UNKNOWNNAMES\" so that the classifier will estimate the gender as unknown."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "metadata": {
     "kernel": "Python 3"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1: buzsaki2013memory\n",
-      "2: Lundine2019\t\t  <--  ***NAME MISSING OR POSSIBLY INCOMPLETE***\n",
-      "3: zurn2020network\n",
-      "4: moralia2005\n",
-      "5: bassett2022curious\n",
-      "6: fake2022  <-- self-citation\n",
-      "7: jurafsky2018n\t\t  <--  ***NAME MISSING OR POSSIBLY INCOMPLETE***\n",
-      "8: mitchell2013gendered\n",
-      "9: chatterjee2021gender\n",
-      "10: fulvio2021imbalance\n",
-      "11: ethnicolr2022black\n",
-      "12: ethnicolr2022hispanic\n",
-      "13: ethnicolr2022asian\n",
-      "14: ethnicolr2022white\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "yourFirstAuthor = 'Stiso, Jennifer'\n",
-    "yourLastAuthor = 'Zhou, Dale'\n",
+    "yourFirstAuthor = 'LastName, FirstName'\n",
+    "yourLastAuthor = 'LastName, FirstName'\n",
     "optionalEqualContributors = ['LastName, FirstName OptionalMiddleInitial', 'LastName, FirstName OptionalMiddleInitial']\n",
     "checkingPublishedArticle = False\n",
     "\n",
@@ -159,37 +147,14 @@
     "    # find and print duplicates\n",
     "    bib_data = get_duplicates(bib_data, bib_files[0])\n",
     "    # get names, remove CDS, find self cites\n",
-    "    get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {
-    "kernel": "R"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "STOP: Please remove self-citations. Then, re-run step 2. Here are some suggestions to check for with the following citation keys in your .bib file: \n",
-      "['fake2022']\n",
-      "STOP: Please revise incomplete full first names or empty cells. Then, re-run step 2. Here are some suggestions to check for with the following citation keys in your .bib file: \n",
-      "['Lundine2019', 'jurafsky2018n']\n",
-      "Only continue if you've run steps 2, and this code no longer returns errors.\n"
-     ]
-    }
-   ],
-   "source": [
+    "    get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)\n",
+    "    \n",
     "bib_check(homedir)"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {
-    "collapsed": false
-   },
+   "metadata": {},
    "source": [
     "## 3. Estimate gender and race of authors from cleaned bibliography\n",
     "\n",
@@ -200,18 +165,6 @@
     "\n",
     "[You can find your key in your account's profile page.](https://gender-api.com/en/account/overview#my-api-key)\n",
     "\n",
-    "__NOTE__: Please edit your .bib file using information printed by the code and provided in cleanedBib.csv. Edit directly within the Binder environment by clicking the .bib file (as shown below), making modifications, and saving the file (as shown below).\n",
-    "\n",
-    "![open button](img/openBib.png)\n",
-    "\n",
-    "![save button](img/saveBib.png)\n",
-    "\n",
-    "Common issues include:\n",
-    "\n",
-    "* Bibliography entry did not include a last author because the author list was truncated by \"and Others\" or \"et al.\"\n",
-    "* Some older journals articles only provide first initial and not full first names, in which case you will need to go digging via Google to identify that person.\n",
-    "* In rare cases where the author cannot be identified even after searching by hand, replace the first name with \"UNKNOWNNAMES\" so that the classifier will estimate the gender as unknown.\n",
-    "\n",
     "__NOTE__: your free account has 500 queries per month. This box contains the code that will use your limited API credits/queries if it runs without error. Re-running all code repeatedly will repeatedly use these credits.\n",
     "\n",
     "Then, run the code blocks below. (click to select the block and then press Ctrl+Enter; or click the block and press the Run button in the top menubar)"
@@ -219,38 +172,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": null,
    "metadata": {
     "kernel": "R"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Remaining credits: 262\n",
-      "This should use (at most) 25 credits, saving you approx 1 credit(s) by storing queries.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "genderAPI_key = '&key='\n",
     "\n",
-    "# The following saves the api key to a txt file just to be reloaded by the next cell\n",
-    "with open(\"genderAPIkey.txt\", 'w') as f:\n",
-    "    f.write(genderAPI_key)\n",
-    "\n",
     "# Check your credit balance\n",
-    "authors_full_list = pd.read_csv(homedir + 'cleanedBib.csv')\n",
-    "url = \"https://gender-api.com/get-stats?key=\" + genderAPI_key\n",
-    "response = urlopen(url)\n",
-    "decoded = response.read().decode('utf-8')\n",
-    "decoded_json = json.loads(decoded)\n",
-    "print('Remaining credits: %s'%decoded_json[\"remaining_requests\"])\n",
-    "print('This should use (at most) %d credits, '%(authors_full_list.FA.nunique() + authors_full_list.LA.nunique()) + \\\n",
-    "        'saving you approx %d'%((authors_full_list.FA.count() + authors_full_list.LA.count())-\n",
-    "                            (authors_full_list.FA.nunique() + authors_full_list.LA.nunique())) + \\\n",
-    "      ' credit(s) by storing queries.')"
+    "check_genderAPI_balance(genderAPI_key, homedir)"
    ]
   },
   {
@@ -268,222 +199,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "metadata": {
     "kernel": "Python 3"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "first author is Jennifer Stiso \n",
-      "last author is Dale Zhou \n",
-      "we don't count these, but check the predictions file to ensure your names did not slip through!\n",
-      "looping through your references, predicting gender and race\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "  0%|          | 0/23 [00:00<?, ?it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      "  4%|▍         | 1/23 [00:01<00:24,  1.11s/it]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      "  9%|▊         | 2/23 [00:01<00:19,  1.06it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      " 13%|█▎        | 3/23 [00:02<00:17,  1.13it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      " 17%|█▋        | 4/23 [00:03<00:16,  1.17it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      " 22%|██▏       | 5/23 [00:03<00:12,  1.44it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      " 35%|███▍      | 8/23 [00:04<00:06,  2.30it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      " 39%|███▉      | 9/23 [00:05<00:06,  2.31it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      " 43%|████▎     | 10/23 [00:06<00:06,  1.91it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      " 48%|████▊     | 11/23 [00:06<00:07,  1.68it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      " 52%|█████▏    | 12/23 [00:07<00:07,  1.54it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      " 57%|█████▋    | 13/23 [00:08<00:05,  1.72it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      " 65%|██████▌   | 15/23 [00:08<00:04,  1.98it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      " 70%|██████▉   | 16/23 [00:09<00:04,  1.70it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      " 74%|███████▍  | 17/23 [00:10<00:03,  1.82it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      " 78%|███████▊  | 18/23 [00:10<00:03,  1.62it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      " 83%|████████▎ | 19/23 [00:11<00:02,  1.50it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      " 87%|████████▋ | 20/23 [00:12<00:02,  1.41it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      " 91%|█████████▏| 21/23 [00:13<00:01,  1.35it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      " 96%|█████████▌| 22/23 [00:13<00:00,  1.52it/s]/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:173: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  fa_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:176: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True)\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:177: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  la_data = np.array(\n",
-      "/Users/stisoj/Documents/dev/cleanBib/utils/queries.py:180: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
-      "  paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True)\n",
-      "100%|██████████| 23/23 [00:14<00:00,  1.58it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Queried gender api 35 times out of 46 entries\n",
-      "Queried race/ethnicity api 36 times out of 46 entries\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "/Users/stisoj/opt/anaconda3/envs/cleanBib/lib/python3.10/site-packages/numpy/core/_methods.py:163: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  arr = asanyarray(a)\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "f = open(\"genderAPIkey.txt\", \"r\")\n",
-    "genderAPI_key = f.readline().replace('\\n', '')\n",
-    "\n",
-    "import tensorflow as tf\n",
-    "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'\n",
-    "\n",
-    "mm, wm, mw, ww, WW, aw, wa, aa, citation_matrix = get_pred_demos((yourFirstAuthor+' '+yourLastAuthor).replace(',',''), homedir, bib_data, genderAPI_key)\n",
+    "mm, wm, mw, ww, WW, aw, wa, aa, citation_matrix, paper_df = get_pred_demos((yourFirstAuthor+' '+yourLastAuthor).replace(',',''), homedir, bib_data, genderAPI_key)\n",
     "statement, statementLatex = print_statements(mm, wm, mw, ww, WW, aw, wa, aa)"
    ]
   },
@@ -521,86 +243,11 @@
     "print('LaTeX template:')\n",
     "print(statementLatex)\n",
     "\n",
-    "cmap = sns.diverging_palette(220, 10, as_cmap=True)\n",
-    "names = ['white_m','api_m','hispanic_m','black_m','white_w','api_w','hispanic_w','black_w']\n",
-    "plt.close()\n",
-    "sns.set(style='white')\n",
-    "fig, axes = plt.subplots(ncols=2,nrows=1,figsize=(7.5,4))\n",
-    "axes = axes.flatten()\n",
-    "plt.sca(axes[0])\n",
-    "heat = sns.heatmap(np.around((citation_matrix/citation_matrix.sum())*100,2),annot=True,ax=axes[0],annot_kws={\"size\": 8},cmap=cmap,vmax=1,vmin=0)\n",
-    "axes[0].set_ylabel('first author',labelpad=0)  \n",
-    "heat.set_yticklabels(names,rotation=0)\n",
-    "axes[0].set_xlabel('last author',labelpad=1)  \n",
-    "heat.set_xticklabels(names,rotation=90) \n",
-    "heat.set_title('percentage of citations')  \n",
-    "\n",
-    "citation_matrix_sum = citation_matrix / np.sum(citation_matrix) \n",
-    "\n",
-    "expected = np.load('/%s/data/expected_matrix_florida.npy'%(homedir))\n",
-    "expected = expected/np.sum(expected)\n",
-    "\n",
-    "percent_overunder = np.ceil( ((citation_matrix_sum - expected) / expected)*100)\n",
-    "plt.sca(axes[1])\n",
-    "heat = sns.heatmap(np.around(percent_overunder,2),annot=True,ax=axes[1],fmt='g',annot_kws={\"size\": 8},vmax=50,vmin=-50,cmap=cmap)\n",
-    "axes[1].set_ylabel('',labelpad=0)  \n",
-    "heat.set_yticklabels('')\n",
-    "axes[1].set_xlabel('last author',labelpad=1)  \n",
-    "heat.set_xticklabels(names,rotation=90) \n",
-    "heat.set_title('percentage over/under-citations')\n",
-    "plt.tight_layout()\n",
-    "\n",
-    "plt.savefig('/home/jovyan/race_gender_citations.pdf')\n",
-    "\n",
-    "\n",
-    "paper_df.to_csv('/home/jovyan/predictions.csv')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "kernel": "R"
-   },
-   "outputs": [],
-   "source": [
-    "# Plot a histogram #\n",
-    "names = pd.read_csv('/home/jovyan/predictions.csv')\n",
-    "total_citations = names.CitationKey.nunique()\n",
-    "names.GendCat = names.GendCat.str.replace('female', 'W', regex=False)\n",
-    "names.GendCat = names.GendCat.str.replace('male', 'M', regex=False)\n",
-    "names.GendCat = names.GendCat.str.replace('unknown', 'U', regex=False)\n",
-    "gend_cats = names['GendCat'].dropna().unique()  # get a vector of all the gender categories in your paper\n",
-    "\n",
-    "# Create a data frame that will be used to plot the histogram. This will have the gender category (e.g., WW, MM) in the first column and the percentage (e.g., number of WW citations divided by total number of citations * 100) in the second column #\n",
-    "dat_for_plot = names.groupby('GendCat').size().reset_index()\n",
-    "dat_for_plot.rename(columns={0:'count'}, inplace=True)\n",
-    "dat_for_plot = dat_for_plot.assign(percentage=dat_for_plot['count']/total_citations*100)\n",
-    "\n",
-    "# Create a data frame with only the WW, MW, WM, MM categories and their base rates - to plot percent citations relative to benchmarks\n",
-    "dat_for_baserate_plot = dat_for_plot.loc[(dat_for_plot.GendCat == 'WW') |\n",
-    "                                         (dat_for_plot.GendCat == 'MW') |\n",
-    "                                         (dat_for_plot.GendCat == 'WM') |\n",
-    "                                         (dat_for_plot.GendCat == 'MM'),:]\n",
-    "# MM,MW,WM,WW\n",
-    "baserate = [58.4, 9.4, 25.5, 6.7]\n",
-    "dat_for_baserate_plot['baserate'] = baserate\n",
-    "dat_for_baserate_plot = dat_for_baserate_plot.assign(citation_rel_to_baserate=\n",
-    "                                                     dat_for_baserate_plot.percentage - dat_for_baserate_plot.baserate\n",
-    "                                                     )\n",
-    "\n",
-    "# plot\n",
-    "plt.figure()\n",
-    "sns.barplot(data=dat_for_plot, x='GendCat', y='count', order=np.flip(gend_cats))\n",
-    "plt.xlabel('Predicted gender category')\n",
-    "plt.ylabel('Number of papers')\n",
-    "plt.tight_layout()\n",
-    "\n",
-    "plt.figure()\n",
-    "sns.barplot(data=dat_for_baserate_plot, x='GendCat', y='citation_rel_to_baserate', order=['WW','WM','MW','MM'])\n",
-    "plt.xlabel('Predicted gender category')\n",
-    "plt.ylabel('% of citations relative to benchmarks')\n",
-    "plt.tight_layout()"
+    "paper_df.to_csv('/home/jovyan/predictions.csv')\n",
+    "\n",
+    "plot_heatmaps(citation_matrix, homedir)\n",
+    "\n",
+    "plot_histograms()"
    ]
   },
   {
@@ -624,38 +271,13 @@
    },
    "outputs": [],
    "source": [
-    "cite_gender = pd.read_csv(homedir+'Authors.csv') # output of getReferenceGends.ipynb\n",
-    "cite_gender.index = cite_gender.CitationKey\n",
-    "cite_gender['Color'] = '' # what color to make each gender category\n",
-    "colors = {'MM':'red','MW':'blue','WW':'green','WM':'magenta','UU':'black',\n",
-    "'MU':'black','UM':'black','UW':'black','WU':'black'}\n",
-    "for idx in cite_gender.index: # loop through each citation key and set color\n",
-    "    cite_gender.loc[idx,'Color'] = colors[cite_gender.loc[idx,'GendCat']]\n",
-    "cite_gender.loc[cite_gender.index[cite_gender.SelfCite=='Y'],'Color'] = 'black' # make self citations black\n",
-    "\n",
-    "fin = open(homedir+tex_file)\n",
-    "texdoc=fin.readlines()\n",
-    "with open(homedir+tex_file[:-4]+'_gendercolor.tex','w') as fout:\n",
-    "    for i in range(len(texdoc)):\n",
-    "        s = texdoc[i]\n",
-    "        cite_instances = re.findall('\\\\\\\\cite\\{.*?\\}',s)\n",
-    "        cite_keys = re.findall('\\\\\\\\cite\\{(.*?)\\}',s)\n",
-    "        cite_keys = [x.split(',') for x in cite_keys]\n",
-    "        cite_keys_sub = [['\\\\textcolor{' + cite_gender.loc[x.strip(),'Color'] + '}{\\\\cite{'+x.strip()+'}}' for x in cite_instance] for cite_instance in cite_keys]\n",
-    "        cite_keys_sub = ['\\\\textsuperscript{,}'.join(x) for x in cite_keys_sub]\n",
-    "        for idx,cite_instance in enumerate(cite_instances):\n",
-    "            s = s.replace(cite_instances[idx],cite_keys_sub[idx])\n",
-    "        fout.write(s)\n",
-    "        # place color key after abstract\n",
-    "        if '\\\\section*{Introduction}\\n' in s:            \n",
-    "            l = ['\\\\textcolor{' + colors[k] + '}{'+k+'}' for k in colors.keys()]\n",
-    "            fout.write('\\tKey: '+ ', '.join(l)+'.\\n')"
+    "colorful_latex(paper_df, homedir, tex_file)"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3.10.5 ('cleanBib')",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -669,7 +291,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.5"
+   "version": "3.10.6"
   },
   "sos": {
    "kernels": [
diff --git a/utils/preprocessing.py b/utils/preprocessing.py
index 58b210b..fca08ed 100644
--- a/utils/preprocessing.py
+++ b/utils/preprocessing.py
@@ -189,12 +189,13 @@ def get_duplicates(bib_data, filename):
             raise ValueError("Unable to successfully remove duplicates")
 
     if len(duplicates) > 0:
-        print("In your .bib file, we found and removed duplicate entries for the following entries:\n " +
+        print("\n In your .bib file, we found and removed duplicate entries for the following entries:\n " +
                       ' '.join(map(str, duplicates)) +
-              "\n If this is incorrect, please edit you .bib file to give unique identifiers for all unique references:")
+              "\n If this is incorrect, please edit your .bib file to give unique identifiers for all unique references. \n")
 
+    if len(duplicates) > 0:
         # write new data to file
-        new_bib = filename[:-4] + '_clean.bib'
+        new_bib = filename[:-4] + '_noDuplicates.bib'
         with open(new_bib, 'w') as bibtex_file:
             bibtexparser.dump(bib_data, bibtex_file)
 
@@ -312,6 +313,7 @@ def get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualC
     for key in bib_data.entries.keys():
         diversity_bib_titles = ['The extent and drivers of gender imbalance in neuroscience reference lists',
                                 'The gender citation gap in international relations',
+                                'Gendered citation patterns in international relations journals',
                                 'Quantitative evaluation of gender bias in astronomical publications from citation counts',
                                 '\# CommunicationSoWhite',
                                 '{Just Ideas? The Status and Future of Publication Ethics in Philosophy: A White Paper}',
@@ -321,7 +323,9 @@ def get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualC
                                 'Gender Diversity Statement and Code Notebook v1.1',
                                 'Gendered citation practices in the field of communication',
                                 'Gender disparity in citations in high- impact journal articles',
+                                'Gender Disparity in Citations in High-Impact Journal Articles',
                                 'Gender (im)balance in citation practices in cognitive neuroscience',
+                                'Gender (Im)balance in Citation Practices in Cognitive Neuroscience',
                                 'Name-ethnicity classification from open sources',
                                 'Predicting race and ethnicity from the sequence of characters in a name']
         if bib_data.entries[key].fields['title'] in diversity_bib_titles:
@@ -473,13 +477,16 @@ def self_cites(author, yourFirstAuthor, yourLastAuthor, optionalEqualContributor
 
 def bib_check(homedir):
     # Do a final check on the bibliography entries
+    authors_full_list = pd.read_csv(homedir + 'cleanedBib.csv')
+    skip_selfCites = list(authors_full_list.loc[authors_full_list['SelfCite'] == 'Y']['CitationKey'])
+
     with open(os.path.join(homedir, 'cleanedBib.csv')) as csvfile:
         names_csv = csv.reader(csvfile)
         names_db = []
         for row in names_csv:
             names_db.append(row)
 
-    incomplete_name_bib_keys, self_cite_bib_keys = [[], []]
+    incomplete_name_bib_keys = []
     authors_full_list = []
     for row in names_db[1:]:  # Skip the first row, it's just headers
         # Check that the authors' names have at least 2 characters and no periods
@@ -487,24 +494,18 @@ def bib_check(homedir):
         authors_full_list.append(first_author)  # For counting the number of query calls needed
         authors_full_list.append(last_author)
         if len(first_author) < 2 or len(last_author) < 2 or '.' in first_author + last_author:
-            incomplete_name_bib_keys.append(bib_key)
-        if self_cite == 'Y':
-            self_cite_bib_keys.append(bib_key)
-
-    if len(self_cite_bib_keys) > 0:
-        warning_message = "STOP: Please remove self-citations. Then, re-run step 2. "
-        warning_message += "Here are some suggestions to check for with the following citation keys in your .bib file: "
-        print(warning_message)
-        print(self_cite_bib_keys)
+            if bib_key not in skip_selfCites:
+                incomplete_name_bib_keys.append(bib_key)
 
     if len(incomplete_name_bib_keys) > 0:
-        warning_message = "STOP: Please revise incomplete full first names or empty cells. Then, re-run step 2. "
+        warning_message = "\n STOP: Please revise incomplete full first names or empty cells. Then, re-run step 2. "
         warning_message += "Here are some suggestions to check for with the following citation keys in your .bib file: "
         print(warning_message)
         print(incomplete_name_bib_keys)
 
-    final_warning_message = "Only continue if you've run steps 2,"
-    final_warning_message += " and this code no longer returns errors."
+    final_warning_message = "\n Only continue if you've run step 2,"
+    final_warning_message += " and this code no longer returns errors or instructions to revise the .bib file."
+    print("\n")
     print(final_warning_message)
 
 
diff --git a/utils/queries.py b/utils/queries.py
index 4f7ff41..d09f615 100644
--- a/utils/queries.py
+++ b/utils/queries.py
@@ -9,6 +9,8 @@
 from urllib.parse import quote
 from urllib.request import urlopen
 import json
+import matplotlib.pyplot as plt
+import seaborn as sns
 
 def namesFromXref(cr, doi, title, authorPos):
     '''Use DOI and article titles to query Crossref for author list'''
@@ -75,6 +77,27 @@ def get_pred_demos(authors, homedir, bibfile, gender_key, font='Palatino', metho
     race = []
 
     idx = 0
+    # skip self-citations
+    authors_full_list = pd.read_csv(homedir + 'cleanedBib.csv')
+    skip_selfCites = list(authors_full_list.loc[authors_full_list['SelfCite'] == 'Y']['CitationKey'])
+    # skip citation diversity statement papers
+    diversity_bib_titles = ['The extent and drivers of gender imbalance in neuroscience reference lists',
+                            'The gender citation gap in international relations',
+                            'Gendered citation patterns in international relations journals',
+                            'Quantitative evaluation of gender bias in astronomical publications from citation counts',
+                            '\# CommunicationSoWhite',
+                            '{Just Ideas? The Status and Future of Publication Ethics in Philosophy: A White Paper}',
+                            'Gendered citation patterns across political science and social science methodology fields',
+                            'Gender Diversity Statement and Code Notebook v1.0',
+                            'Racial and ethnic imbalance in neuroscience reference lists and intersections with gender',
+                            'Gender Diversity Statement and Code Notebook v1.1',
+                            'Gendered citation practices in the field of communication',
+                            'Gender disparity in citations in high- impact journal articles',
+                            'Gender Disparity in Citations in High-Impact Journal Articles',
+                            'Gender (im)balance in citation practices in cognitive neuroscience',
+                            'Gender (Im)balance in Citation Practices in Cognitive Neuroscience',
+                            'Name-ethnicity classification from open sources',
+                            'Predicting race and ethnicity from the sequence of characters in a name']
     # save base gender rates
     gender_base = get_gender_base(homedir)
     # make a dictionary of names so we don't query the same thing twice
@@ -83,6 +106,10 @@ def get_pred_demos(authors, homedir, bibfile, gender_key, font='Palatino', metho
     n_gen_queries = 0
     n_race_queries = 0
     for paper in tqdm.tqdm(bibfile.entries, total=len(bibfile.entries)):
+        if paper in skip_selfCites:
+            continue
+        if bibfile.entries[paper].fields['title'] in diversity_bib_titles:
+            continue
         if 'author' not in bibfile.entries[paper].persons.keys():
             continue  # some editorials have no authors
         if 'year' not in bibfile.entries[paper].fields.keys():
@@ -209,7 +236,7 @@ def get_pred_demos(authors, homedir, bibfile, gender_key, font='Palatino', metho
     mm, wm, mw, ww = np.mean(gender, axis=0) * 100
     WW, aw, wa, aa = np.mean(race, axis=0) * 100
 
-    return mm, wm, mw, ww, WW, aw, wa, aa, citation_matrix
+    return mm, wm, mw, ww, WW, aw, wa, aa, citation_matrix, paper_df
 
 def gen_api_query(gender_key, name, gb):
     url = "https://gender-api.com/get?key=" + gender_key + "&name=%s" % (quote(name))
@@ -225,51 +252,177 @@ def gen_api_query(gender_key, name, gb):
     return gender, g
 
 def print_statements(mm, wm, mw, ww, WW, aw, wa, aa):
-    statement = "Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars \
-    are under-cited relative to the number of such papers in the field (1-9). Here we sought to proactively consider choosing references that reflect the \
-    diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. First, we obtained the predicted gender of the first \
-    and last author of each reference by using databases that store the probability of a first name being carried by a woman (5, 10). By this measure \
-    (and excluding self-citations to the first and last authors of our current paper), our references contain ww% woman(first)/woman(last), \
-    MW% man/woman, WM% woman/man, and MM% man/man. This method is limited in that a) names, pronouns, and social media profiles used to construct the \
-    databases may not, in every case, be indicative of gender identity and b) it cannot account for intersex, non-binary, or transgender people. \
-    Second, we obtained predicted racial/ethnic category of the first and last author of each reference by databases that store the probability of a \
-    first and last name being carried by an author of color (11, 12). By this measure (and excluding self-citations), our references contain AA% author of \
-    color (first)/author of color(last), WA% white author/author of color, AW% author of color/white author, and WW% white author/white author. This method \
-    is limited in that a) names and Florida Voter Data to make the predictions may not be indicative of racial/ethnic identity, and b) \
-    it cannot account for Indigenous and mixed-race authors, or those who may face differential biases due to the ambiguous racialization or ethnicization of their names.  \
-    We look forward to future work that could help us to better understand how to support equitable practices in science."
+    statement = ("Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars "
+    "are under-cited relative to the number of such papers in the field (1-9). Here we sought to proactively consider choosing references that reflect the "
+    "diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. First, we obtained the predicted gender of the first "
+    "and last author of each reference by using databases that store the probability of a first name being carried by a woman (5, 10). By this measure "
+    "(and excluding self-citations to the first and last authors of our current paper), our references contain ww% woman(first)/woman(last), "
+    "MW% man/woman, WM% woman/man, and MM% man/man. This method is limited in that a) names, pronouns, and social media profiles used to construct the "
+    "databases may not, in every case, be indicative of gender identity and b) it cannot account for intersex, non-binary, or transgender people. "
+    "Second, we obtained predicted racial/ethnic category of the first and last author of each reference by databases that store the probability of a "
+    "first and last name being carried by an author of color (11, 12). By this measure (and excluding self-citations), our references contain AA% author of "
+    "color (first)/author of color(last), WA% white author/author of color, AW% author of color/white author, and WW% white author/white author. This method "
+    "is limited in that a) names and Florida Voter Data to make the predictions may not be indicative of racial/ethnic identity, and b) "
+    "it cannot account for Indigenous and mixed-race authors, or those who may face differential biases due to the ambiguous racialization or ethnicization of their names. "
+    "We look forward to future work that could help us to better understand how to support equitable practices in science.")
 
     statement = statement.replace('MM', str(np.around(mm, 2)))
     statement = statement.replace('WM', str(np.around(wm, 2)))
     statement = statement.replace('MW', str(np.around(mw, 2)))
     statement = statement.replace('ww', str(np.around(ww, 2)))
-    statement = statement.replace('WW', str(np.around(WW, 2)))
-    statement = statement.replace('AW', str(np.around(aw, 2)))
-    statement = statement.replace('WA', str(np.around(wa, 2)))
+    statement = statement.replace('WW', np.array2string(WW.values[0], formatter={'float_kind':lambda x: "%.2f" % x}))
+    statement = statement.replace('AW', np.array2string(aw.values[0], formatter={'float_kind':lambda x: "%.2f" % x}))
+    statement = statement.replace('WA', np.array2string(wa.values[0], formatter={'float_kind':lambda x: "%.2f" % x}))
     statement = statement.replace('AA', str(np.around(aa, 2)))
 
-    statementLatex = "Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars \
-    are under-cited relative to the number of such papers in the field \cite{mitchell2013gendered,dion2018gendered,caplar2017quantitative, maliniak2013gender, Dworkin2020.01.03.894378, bertolero2021racial, wang2021gendered, chatterjee2021gender, fulvio2021imbalance}. Here we sought to proactively consider choosing references that reflect the\
-    diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. First, we obtained the predicted gender of the first \
-    and last author of each reference by using databases that store the probability of a first name being carried by a woman \cite{Dworkin2020.01.03.894378,zhou_dale_2020_3672110}. By this measure \
-    (and excluding self-citations to the first and last authors of our current paper), our references contain ww\% woman(first)/woman(last), \
-    MW\% man/woman, WM\% woman/man, and MM\% man/man. This method is limited in that a) names, pronouns, and social media profiles used to construct the \
-    databases may not, in every case, be indicative of gender identity and b) it cannot account for intersex, non-binary, or transgender people. \
-    Second, we obtained predicted racial/ethnic category of the first and last author of each reference by databases that store the probability of a \
-    first and last name being carried by an author of color \cite{ambekar2009name, sood2018predicting}. By this measure (and excluding self-citations), our references contain AA\% author of \
-    color (first)/author of color(last), WA\% white author/author of color, AW\% author of color/white author, and WW\% white author/white author. This method \
-    is limited in that a) names and Florida Voter Data to make the predictions may not be indicative of racial/ethnic identity, and b) \
-    it cannot account for Indigenous and mixed-race authors, or those who may face differential biases due to the ambiguous racialization or ethnicization of their names.  \
-    We look forward to future work that could help us to better understand how to support equitable practices in science."
+    statementLatex = ("Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars "
+    "are under-cited relative to the number of such papers in the field \cite{mitchell2013gendered,dion2018gendered,caplar2017quantitative, maliniak2013gender, Dworkin2020.01.03.894378, bertolero2021racial, wang2021gendered, chatterjee2021gender, fulvio2021imbalance}. Here we sought to proactively consider choosing references that reflect the "
+    "diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. First, we obtained the predicted gender of the first "
+    "and last author of each reference by using databases that store the probability of a first name being carried by a woman \cite{Dworkin2020.01.03.894378,zhou_dale_2020_3672110}. By this measure "
+    "(and excluding self-citations to the first and last authors of our current paper), our references contain ww\% woman(first)/woman(last), "
+    "MW\% man/woman, WM\% woman/man, and MM\% man/man. This method is limited in that a) names, pronouns, and social media profiles used to construct the "
+    "databases may not, in every case, be indicative of gender identity and b) it cannot account for intersex, non-binary, or transgender people. "
+    "Second, we obtained predicted racial/ethnic category of the first and last author of each reference by databases that store the probability of a "
+    "first and last name being carried by an author of color \cite{ambekar2009name, sood2018predicting}. By this measure (and excluding self-citations), our references contain AA\% author of "
+    "color (first)/author of color(last), WA\% white author/author of color, AW\% author of color/white author, and WW\% white author/white author. This method "
+    "is limited in that a) names and Florida Voter Data to make the predictions may not be indicative of racial/ethnic identity, and b) "
+    "it cannot account for Indigenous and mixed-race authors, or those who may face differential biases due to the ambiguous racialization or ethnicization of their names. "
+    "We look forward to future work that could help us to better understand how to support equitable practices in science.")
 
     statementLatex = statementLatex.replace('MM', str(np.around(mm, 2)))
     statementLatex = statementLatex.replace('WM', str(np.around(wm, 2)))
     statementLatex = statementLatex.replace('MW', str(np.around(mw, 2)))
     statementLatex = statementLatex.replace('ww', str(np.around(ww, 2)))
-    statementLatex = statementLatex.replace('WW', str(np.around(WW, 2)))
-    statementLatex = statementLatex.replace('AW', str(np.around(aw, 2)))
-    statementLatex = statementLatex.replace('WA', str(np.around(wa, 2)))
+    statementLatex = statementLatex.replace('WW', np.array2string(WW.values[0], formatter={'float_kind':lambda x: "%.2f" % x}))
+    statementLatex = statementLatex.replace('AW', np.array2string(aw.values[0], formatter={'float_kind':lambda x: "%.2f" % x}))
+    statementLatex = statementLatex.replace('WA', np.array2string(wa.values[0], formatter={'float_kind':lambda x: "%.2f" % x}))
     statementLatex = statementLatex.replace('AA', str(np.around(aa, 2)))
 
     return statement, statementLatex
 
+def plot_heatmaps(citation_matrix, homedir):
+    cmap = sns.diverging_palette(220, 10, as_cmap=True)
+    names = ['white_m','api_m','hispanic_m','black_m','white_w','api_w','hispanic_w','black_w']
+    plt.close()
+    sns.set(style='white')
+    fig, axes = plt.subplots(ncols=2,nrows=1,figsize=(7.5,4))
+    axes = axes.flatten()
+    plt.sca(axes[0])
+    heat = sns.heatmap(np.around((citation_matrix/citation_matrix.sum())*100,2),annot=True,ax=axes[0],annot_kws={"size": 8},cmap=cmap,vmax=1,vmin=0)
+    axes[0].set_ylabel('first author',labelpad=0)  
+    heat.set_yticklabels(names,rotation=0)
+    axes[0].set_xlabel('last author',labelpad=1)  
+    heat.set_xticklabels(names,rotation=90) 
+    heat.set_title('percentage of citations')  
+
+    citation_matrix_sum = citation_matrix / np.sum(citation_matrix) 
+
+    expected = np.load('/%s/data/expected_matrix_florida.npy'%(homedir))
+    expected = expected/np.sum(expected)
+
+    percent_overunder = np.ceil( ((citation_matrix_sum - expected) / expected)*100)
+    plt.sca(axes[1])
+    heat = sns.heatmap(np.around(percent_overunder,2),annot=True,ax=axes[1],fmt='g',annot_kws={"size": 8},vmax=50,vmin=-50,cmap=cmap)
+    axes[1].set_ylabel('',labelpad=0)  
+    heat.set_yticklabels('')
+    axes[1].set_xlabel('last author',labelpad=1)  
+    heat.set_xticklabels(names,rotation=90) 
+    heat.set_title('percentage over/under-citations')
+    plt.tight_layout()
+
+    plt.savefig('/home/jovyan/race_gender_citations.pdf')
+
+def plot_histograms():
+    # Plot a histogram #
+    names = pd.read_csv('/home/jovyan/predictions.csv')
+    total_citations = names.CitationKey.nunique()
+    names.GendCat = names.GendCat.str.replace('female', 'W', regex=False)
+    names.GendCat = names.GendCat.str.replace('male', 'M', regex=False)
+    names.GendCat = names.GendCat.str.replace('unknown', 'U', regex=False)
+    gend_cats = names['GendCat'].dropna().unique()  # get a vector of all the gender categories in your paper
+
+    # Create a data frame that will be used to plot the histogram. This will have the gender category (e.g., WW, MM) in the first column and the percentage (e.g., number of WW citations divided by total number of citations * 100) in the second column #
+    dat_for_plot = names.groupby('GendCat').size().reset_index()
+    all_cats = ['MU', 'WW', 'UM', 'MW', 'WM', 'UW', 'MM']
+    empty_dat_for_plot = pd.DataFrame(0, index=np.arange(7), columns=['GendCat', 0])
+    empty_dat_for_plot['GendCat'] = all_cats
+    set(dat_for_plot['GendCat']).intersection(empty_dat_for_plot['GendCat'])
+    for i in set(dat_for_plot['GendCat']).intersection(empty_dat_for_plot['GendCat']):
+        empty_dat_for_plot.loc[empty_dat_for_plot['GendCat'] == i, 0] = dat_for_plot.loc[dat_for_plot['GendCat']== i, 0].values
+    dat_for_plot = empty_dat_for_plot
+    dat_for_plot.rename(columns={0:'count'}, inplace=True)
+    dat_for_plot = dat_for_plot.assign(percentage=dat_for_plot['count']/total_citations*100)
+
+    # Create a data frame with only the WW, MW, WM, MM categories and their base rates - to plot percent citations relative to benchmarks
+    dat_for_baserate_plot = dat_for_plot.loc[(dat_for_plot.GendCat == 'WW') |
+                                             (dat_for_plot.GendCat == 'MW') |
+                                             (dat_for_plot.GendCat == 'WM') |
+                                             (dat_for_plot.GendCat == 'MM'),:]
+    # Row order after filtering follows all_cats: WW,MW,WM,MM
+    # 58.4% for man/man, 9.4% for man/woman, 25.5% for woman/man, and 6.7% for woman/woman
+    baserate = [6.7, 9.4, 25.5, 58.4]
+    dat_for_baserate_plot['baserate'] = baserate
+    dat_for_baserate_plot = dat_for_baserate_plot.assign(citation_rel_to_baserate=
+                                                         dat_for_baserate_plot.percentage - dat_for_baserate_plot.baserate
+                                                         )
+
+    # plot
+    plt.figure()
+    sns.barplot(data=dat_for_plot, x='GendCat', y='count', order=np.flip(gend_cats))
+    plt.xlabel('Predicted gender category')
+    plt.ylabel('Number of papers')
+    plt.tight_layout()
+
+    plt.figure()
+    sns.barplot(data=dat_for_baserate_plot, x='GendCat', y='citation_rel_to_baserate', order=['WW','WM','MW','MM'])
+    plt.xlabel('Predicted gender category')
+    plt.ylabel('% of citations relative to benchmarks')
+    plt.tight_layout()
+
+
+def check_genderAPI_balance(genderAPI_key, homedir):
+    authors_full_list = pd.read_csv(homedir + 'cleanedBib.csv')
+    authors_full_list = authors_full_list.loc[authors_full_list['SelfCite'] == 'N']
+
+    url = "https://gender-api.com/get-stats?key=" + genderAPI_key
+    response = urlopen(url)
+    decoded = response.read().decode('utf-8')
+    decoded_json = json.loads(decoded)
+    print('Remaining credits: %s'%decoded_json["remaining_requests"])
+    print('This should use (at most) %d credits, '%(authors_full_list.FA.nunique() + authors_full_list.LA.nunique()) + \
+            'saving you approx %d'%((authors_full_list.FA.count() + authors_full_list.LA.count())-
+                                (authors_full_list.FA.nunique() + authors_full_list.LA.nunique())) + \
+          ' credit(s) by storing queries.')
+
+
+def colorful_latex(paper_df, homedir, tex_file):
+    cite_gender = paper_df[1::2]
+    cite_gender.GendCat = cite_gender.GendCat.str.replace('female', 'W', regex=False)
+    cite_gender.GendCat = cite_gender.GendCat.str.replace('male', 'M', regex=False)
+    cite_gender.GendCat = cite_gender.GendCat.str.replace('unknown', 'U', regex=False)
+    cite_gender.index = cite_gender.CitationKey
+    cite_gender['Color'] = '' # what color to make each gender category
+    colors = {'MM':'red','MW':'blue','WW':'green','WM':'magenta','UU':'black',
+    'MU':'black','UM':'black','UW':'black','WU':'black'}
+    for idx in cite_gender.index: # loop through each citation key and set color
+        cite_gender.loc[idx,'Color'] = colors[cite_gender.loc[idx,'GendCat']]
+
+    fin = open(homedir+tex_file)
+    texdoc=fin.readlines()
+    with open(homedir+tex_file[:-4]+'_gendercolor.tex','w') as fout:
+        for i in range(len(texdoc)):
+            s = texdoc[i]
+            cite_instances = re.findall('\\\\cite\{.*?\}',s)
+            cite_keys = re.findall('\\\\cite\{(.*?)\}',s)
+            cite_keys = [x.split(',') for x in cite_keys]
+            cite_keys_sub = [['\\textcolor{' + cite_gender.loc[x.strip(),'Color'] + '}{\\cite{'+x.strip()+'}}' for x in cite_instance] for cite_instance in cite_keys]
+            cite_keys_sub = ['\\textsuperscript{,}'.join(x) for x in cite_keys_sub]
+            for idx,cite_instance in enumerate(cite_instances):
+                s = s.replace(cite_instances[idx],cite_keys_sub[idx])
+            fout.write(s)
+            # place color key after abstract
+            if '\\section*{Introduction}\n' in s:            
+                l = ['\\textcolor{' + colors[k] + '}{'+k+'}' for k in colors.keys()]
+                fout.write('\tKey: '+ ', '.join(l)+'.\n')
+
+

From 14c9dc7a0d3582d607dbbfd4e7ad10356505b118 Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Mon, 28 Nov 2022 23:26:28 -0500
Subject: [PATCH 46/47] update README

---
 README.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/README.md b/README.md
index d73aecd..768dbf6 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,7 @@ And editorials and research highlights of this work:
 * Z. Budrikis (2020). Growing citation gender gap. *Nature Reviews Physics*. [doi: https://doi.org/10.1038/s42254-020-0207-3](https://doi.org/10.1038/s42254-020-0207-3)
 * D. J. Sweet (2021). New at cell press: the inclusion and diversity statement. *Cell*, 184(1), 1-2. [doi: https://doi.org/10.1016/j.cell.2020.12.019](https://www.sciencedirect.com/science/article/pii/S0092867420316895?via%3Dihub)
 * B. Rowson, S.M. Duma, M.R. King, I. Efimov, A. Saterbak, and N.C. Chesler (2021). Citation diversity statement in BMES journals. *Annals of Biomedical Engineering*, 1-3. [doi: https://doi.org/10.1007/s10439-021-02739-6](https://link.springer.com/article/10.1007/s10439-021-02739-6)
+* D. Kwon (2022). The rise of citational justice: how scholars are making references fairer. *Nature*, 603(7902), 568-571. [doi: https://doi.org/10.1038/d41586-022-00793-1](https://www.nature.com/articles/d41586-022-00793-1)
 
 For `.pdf` and `.tex` templates of the statement, see the `/diversityStatement` directory in this repository.
 
@@ -333,6 +334,7 @@ ___
 * Christopher Camp
 * Eli Cornblath
 * Jordan Dworkin
+* Kieran Murphy
 * Jordan Matelsky
 * Cleanthis Michael
 * Kendra Oudyk
@@ -344,6 +346,15 @@ ___
 * Dale Zhou
 
 # Changelog
+* __11/28/2022__
+  * major refactor (thanks, Jeni!)
+  * removed SOS notebook
+  * upgraded all packages and libraries
+  * all R code now in Python
+  * majority of the code now calls functions located in utils/
+  * automate removal of identified duplicates and self-citations
+  * query how many credits are left and save some by only querying unique names (thanks, Kieran!)
+  * fixes issue introduced by [protobuf upgrade](https://github.com/protocolbuffers/protobuf/issues/10051)
 
 * __9/14/2021__
   * force Binder to load with [classic theme](https://discourse.jupyter.org/t/mybinder-org-using-jupyterlab-by-default/10715) because new default breaks SOS notebook R code

From d6f2b8b4f267de39bb69357bcc3c704ca58d4980 Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Mon, 28 Nov 2022 23:30:09 -0500
Subject: [PATCH 47/47] update launch binder link

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 768dbf6..590897f 100644
--- a/README.md
+++ b/README.md
@@ -119,7 +119,7 @@ ___
 
 2. Launch the coding environment. Please refresh the page if the Binder does not load after 5-10 mins.
 
-    [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/dalejn/cleanBib/34b3a896b6fe0961b2dfc3ad22214cf45da48cca?urlpath=/tree/)
+    [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/dalejn/cleanBib/refactor?urlpath=/tree/)
 
 3. Open the notebook `cleanBib.ipynb`. Follow the instructions above each code block. It can take 10 minutes to 1 hour complete all of the instructions, depending on the state and size of your `.bib` file. We expect that the most time-consuming step will be manually modifying the `.bib` file to find missing author names, fill incomplete entries, and fix formatting errors. These problems arise because automated methods of reference mangagers and Google Scholar sometimes can not retrieve full information, for example if some journals only provide an author's first initial instead of their full first name.