From 921f08660dbeab8cdd508d3c8167342429ae8ea0 Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Mon, 25 Apr 2022 14:57:17 -0400 Subject: [PATCH 01/47] add tests --- tests/outfile.txt | 8 ++ tests/predictions.csv | 35 +++++ tests/testBib_erroneous.bib | 268 +++++++++++++++++++++++++++++++++++ tests/testBib_immaculate.bib | 257 +++++++++++++++++++++++++++++++++ 4 files changed, 568 insertions(+) create mode 100644 tests/outfile.txt create mode 100644 tests/predictions.csv create mode 100644 tests/testBib_erroneous.bib create mode 100644 tests/testBib_immaculate.bib diff --git a/tests/outfile.txt b/tests/outfile.txt new file mode 100644 index 0000000..735da3e --- /dev/null +++ b/tests/outfile.txt @@ -0,0 +1,8 @@ +2.61 0.52 0.07 0.73 2.07 0.05 0.10 0.09 +0.55 0.08 0.01 0.04 0.36 0.36 0.10 0.06 +0.09 0.10 0.00 0.06 0.19 0.01 0.89 0.01 +0.24 0.04 0.01 0.63 0.09 0.02 0.01 0.36 +0.97 0.03 0.01 0.11 2.16 0.02 0.05 0.13 +0.81 0.00 0.01 0.06 0.25 0.00 0.00 0.00 +0.08 0.00 0.00 0.01 0.09 0.00 0.00 0.01 +0.04 0.00 0.00 0.00 0.49 0.00 0.01 0.01 diff --git a/tests/predictions.csv b/tests/predictions.csv new file mode 100644 index 0000000..aeff3e8 --- /dev/null +++ b/tests/predictions.csv @@ -0,0 +1,35 @@ +,CitationKey,Author,Gender,W,A,GendCat +0,buzsaki2013memory,"György,Buzsáki","male,99",0.9489384293556213,0.051061596255749464, +1,buzsaki2013memory,"Edvard,Moser","male,99",0.8077561855316162,0.1922438140027225,malemale +2,Lundine2019,"J,Lundine","unknown,0",0.8451798558235168,0.15482008177787066, +3,Lundine2019,"Dina,Balabanova","female,96",0.8948734998703003,0.10512647964060307,unknownfemale +4,wang2021gendered,"Xinyi,Wang","female,88",0.0034336799290031195,0.9965664306655526, +5,wang2021gendered,"David,Lydon-Staley","male,99",0.9184235334396362,0.0815764528233558,femalemale +6,moralia2005,"William,Plutarch","male,99",0.9850617051124573,0.014938272070139647, +7,moralia2005,"William,Plutarch","male,99",0.9850617051124573,0.014938272070139647,malemale 
+8,jurafsky2018n,"D,Jurafsky","unknown,0",0.9343108534812927,0.0656891418620944, +9,jurafsky2018n,"JH,Martin","male,82",0.49100935459136963,0.5089906957000494,unknownmale +10,maliniak2013gender,"Daniel,Maliniak","male,99",0.9659098386764526,0.03409015154466033, +11,maliniak2013gender,"Barbara,Walter","female,98",0.9297952651977539,0.07020476355683059,malefemale +12,caplar2017quantitative,"Neven,Caplar","male,75",0.8359426856040955,0.1640572901815176, +13,caplar2017quantitative,"Simon,Birrer","male,98",0.599306046962738,0.4006939800456166,malemale +14,mitchell2013gendered,"Sara,Mitchell","female,98",0.8715987205505371,0.12840125896036625, +15,mitchell2013gendered,"Holly,Brus","female,96",0.942419707775116,0.05758026405237615,femalefemale +16,dion2018gendered,"Michelle,Dion","female,97",0.8750959038734436,0.12490414828062057, +17,dion2018gendered,"Sara,Mitchell","female,98",0.8715987205505371,0.12840125896036625,femalefemale +18,ambekar2009name,"Anurag,Ambekar","male,100",0.2662626802921295,0.7337373411282897, +19,ambekar2009name,"Steven,Skiena","male,99",0.9796334505081177,0.020366581855341792,malemale +20,sood2018predicting,"Gaurav,Sood","male,100",0.6892917156219482,0.31070827692747116, +21,sood2018predicting,"Suriyan,Laohaprapanon","male,96",0.06737928092479706,0.9326208103448153,malemale +22,chatterjee2021gender,"Paula,Chatterjee","female,98",0.41062963008880615,0.5893703922629356, +23,chatterjee2021gender,"Rachel,Werner","female,98",0.9786281585693359,0.02137181058060378,femalefemale +24,fulvio2021imbalance,"Jacqueline,Fulvio","female,98",0.8698657751083374,0.13013425190001726, +25,fulvio2021imbalance,"Bradley,Postle","male,99",0.9850615859031677,0.014938393025659025,femalemale +26,ethnicolr2022black,"Denzel,Washington","male,100",0.004687963519245386,0.9953120946884155, +27,ethnicolr2022black,"Ketanji,Brown-Jackson","unknown,0",0.004336825106292963,0.9956631234381348,maleunknown 
+28,ethnicolr2022hispanic,"Rafael,Cruz","male,99",0.0451166108250618,0.9548833764856681, +29,ethnicolr2022hispanic,"Alexandria,Ocasio-Cortez","female,97",0.02650504559278488,0.9734949340345338,malefemale +30,ethnicolr2022asian,"Andrew,Wang","male,99",0.07631298899650574,0.923687070608139, +31,ethnicolr2022asian,"Michelle,Yeoh","female,97",0.3952791690826416,0.6047207862138748,malefemale +32,ethnicolr2022white,"Nicolas,Coppola","male,99",0.8281028866767883,0.17189706675708294, +33,ethnicolr2022white,"Meryl,Streep","female,91",0.9587977528572083,0.04120224388316274,malefemale diff --git a/tests/testBib_erroneous.bib b/tests/testBib_erroneous.bib new file mode 100644 index 0000000..2bcc596 --- /dev/null +++ b/tests/testBib_erroneous.bib @@ -0,0 +1,268 @@ +@article{buzsaki2013memory, + title={Memory, navigation and theta rhythm in the hippocampal-entorhinal system}, + author={Buzs{\'a}ki, Gy{\"o}rgy and Moser, Edvard I}, + journal={Nature neuroscience}, + volume={16}, + number={2}, + pages={130}, + year={2013}, + publisher={Nature Publishing Group} +} + +@article{Lundine2019, + + abstract = {Academic experts share their ideas, as well as contribute to advancing health science by participating in publishing as an author, reviewer and editor. The academy shapes and is shaped by knowledge produced within it. As such, the production of scientific knowledge can be described as part of a socially constructed system. Like all socially constructed systems, scientific knowledge production is influenced by gender. This study investigated one layer of this system through an analysis of journal editors' understanding of if and how gender influences editorial practices in peer reviewed health science journals. The study involved two stages: 1) exploratory in-depth qualitative interviews with editors at health science journals; and 2) a nominal group technique (NGT) with experts working on gender in research, academia and the journal peer review process. 
Our findings indicate that some editors had not considered the impact of gender on their editorial work. Many described how they actively strive to be ‘gender blind,' as this was seen as a means to be objective. This view fails to recognize how broader social structures operate to produce systemic inequities. None of the editors or publishers in this study were collecting gender or other social indicators as part of the article submission process. These findings suggest that there is room for editors and publishers to play a more active role in addressing structural inequities in academic publishing to ensure a diversity of knowledge and ideas are reflected.}, + + author = {Lundine, J. and Bourgeault, Ivy Lynn and Glonti, Ketevan and Hutchinson, Eleanor and Balabanova, Dina}, + + doi = {10.1016/j.socscimed.2019.112388}, + + file = {:Users/stiso/Downloads/1-s2.0-S0277953619303739-main.pdf:pdf}, + + issn = {18735347}, + + journal = {Social Science and Medicine}, + + keywords = {Feminist science studies,Gender inequity,Health sciences,Journalology,Peer review,Publishing}, + + number = {January}, + + pages = {112388}, + + pmid = {31288167}, + + publisher = {Elsevier}, + + title = {{“I don't see gender”: Conceptualizing a gendered system of academic publishing}}, + + url = {https://doi.org/10.1016/j.socscimed.2019.112388}, + + volume = {235}, + + year = {2019} + +} + +@article{wang2021gendered, + + author = {Wang, Xinyi and Dworkin, Jordan D. and Zhou, Dale and Stiso, Jennifer and Falk, Emily B and Bassett, Danielle S. 
and Zurn, Perry and Lydon-Staley, David M.}, + + year = {2021}, + + title = {Gendered citation practices in the field of communication}, + + journal = {Annals of the International Communication Association}, + + doi = {10.1080/23808985.2021.1960180}, +} + +@article{zurn2020network, + title={Network architectures supporting learnability}, + author={Zurn, Perry and Bassett, Danielle S}, + journal={Philosophical Transactions of the Royal Society B}, + volume={375}, + number={1796}, + pages={20190323}, + year={2020}, + publisher={The Royal Society} +} + +@article{zurn2020network, + title={Network architectures supporting learnability}, + author={Zurn, Perry and Bassett, Danielle S}, + journal={Philosophical Transactions of the Royal Society B}, + volume={375}, + number={1796}, + pages={20190323}, + year={2020}, + publisher={The Royal Society} +} + +@book{moralia2005, + title={Moralia, Volume VI}, + author={Plutarch, Helmbold, William}, + year={1939}, + publisher={Harvard University Press} +} + +@book{bassett2022curious, +title={Curious Minds}, +author={Danielle S. Bassett and Perry Zurn}, +publisher={MIT Press}, +year={2022}, +} + +@book{fake2022, +title={fake}, +author={Danielle S. Bassett and Dale Zhou and Jennifer Stiso}, +publisher={MIT Press}, +year={2022}, +} + +@article{jurafsky2018n, + title={N-gram language models}, + author={Jurafsky, D and Martin, JH}, + journal={Speech and Language Processing: An Introduction to Natural Language Processing, Computational Linguistics, and Speech Recognition}, + year={2018} +} + +@article {Dworkin2020.01.03.894378, + author = {Dworkin, Jordan D. and Linn, Kristin A. and Teich, Erin G. and Zurn, Perry and Shinohara, Russell T. 
and Bassett, Danielle S.}, + title = {The extent and drivers of gender imbalance in neuroscience reference lists}, + elocation-id = {2020.01.03.894378}, + year = {2020}, + doi = {10.1101/2020.01.03.894378}, + publisher = {Cold Spring Harbor Laboratory}, + abstract = {Like many scientific disciplines, neuroscience has increasingly attempted to confront pervasive gender imbalances within the field. While much of the conversation has centered around publishing and conference participation, recent research in other fields has called attention to the prevalence of gender bias in citation practices. Because of the downstream effects that citations can have on visibility and career advancement, understanding and eliminating gender bias in citation practices is vital for addressing inequity in a scientific community. In this study, we sought to determine whether there is evidence of gender bias in the citation practices of neuroscientists. Utilizing data from five top neuroscience journals, we indeed find that reference lists tend to include more papers with men as first and last author than would be expected if gender was not a factor in referencing. Importantly, we show that this overcitation of men and undercitation of women is driven largely by the citation practices of men, and is increasing with time despite greater diversity in the academy. We develop a co-authorship network to determine the degree to which homophily in researchers{\textquoteright} social networks explains gendered citation practices and we find that men tend to overcite other men even when their social networks are representative of the field. 
We discuss possible mechanisms and consider how individual researchers might incorporate these findings into their own referencing practices.}, + URL = {https://www.biorxiv.org/content/early/2020/01/11/2020.01.03.894378}, + eprint = {https://www.biorxiv.org/content/early/2020/01/11/2020.01.03.894378.full.pdf}, + journal = {bioRxiv} +} + +@article{maliniak2013gender, + title={The gender citation gap in international relations}, + author={Maliniak, Daniel and Powers, Ryan and Walter, Barbara F}, + journal={International Organization}, + volume={67}, + number={4}, + pages={889--922}, + year={2013}, + publisher={Cambridge University Press} +} + +@article{caplar2017quantitative, + title={Quantitative evaluation of gender bias in astronomical publications from citation counts}, + author={Caplar, Neven and Tacchella, Sandro and Birrer, Simon}, + journal={Nature Astronomy}, + volume={1}, + number={6}, + pages={0141}, + year={2017}, + publisher={Nature Publishing Group} +} + +@article{mitchell2013gendered, + title={Gendered citation patterns in international relations journals}, + author={Mitchell, Sara McLaughlin and Lange, Samantha and Brus, Holly}, + journal={International Studies Perspectives}, + volume={14}, + number={4}, + pages={485--492}, + year={2013}, + publisher={Blackwell Publishing Ltd Oxford, UK} +} + +@article{dion2018gendered, + title={Gendered citation patterns across political science and social science methodology fields}, + author={Dion, Michelle L and Sumner, Jane Lawrence and Mitchell, Sara McLaughlin}, + journal={Political Analysis}, + volume={26}, + number={3}, + pages={312--327}, + year={2018}, + publisher={Cambridge University Press} +} + +@software{zhou_dale_2020_3672110, + author = {Zhou, Dale and + Cornblath, Eli J. and + Stiso, Jennifer and + Teich, Erin G. and + Dworkin, Jordan D. and + Blevins, Ann S. 
and + Bassett, Danielle S.}, + title = {Gender Diversity Statement and Code Notebook v1.0}, + month = feb, + year = 2020, + publisher = {Zenodo}, + version = {v1.0}, + doi = {10.5281/zenodo.3672110}, + url = {https://doi.org/10.5281/zenodo.3672110} +} + +@inproceedings{ambekar2009name, + title={Name-ethnicity classification from open sources}, + author={Ambekar, Anurag and Ward, Charles and Mohammed, Jahangir and Male, Swapna and Skiena, Steven}, + booktitle={Proceedings of the 15th ACM SIGKDD international conference on Knowledge Discovery and Data Mining}, + pages={49--58}, + year={2009} +} + +@article{sood2018predicting, + title={Predicting race and ethnicity from the sequence of characters in a name}, + author={Sood, Gaurav and Laohaprapanon, Suriyan}, + journal={arXiv preprint arXiv:1805.02109}, + year={2018} +} + +@article{bertolero2021racial, +title = {Racial and ethnic imbalance in neuroscience reference lists and intersections with gender}, +author = {Bertolero, Maxwell A. and Dworkin, Jordan D. and David, Sophia U. and Lloreda, Claudia López and Srivastava, Pragya and Stiso, Jennifer and Zhou, Dale and Dzirasa, Kafui and Fair, Damien A. and Kaczkurkin, Antonia N. and Marlin, Bianca Jones and Shohamy, Daphna and Uddin, Lucina Q. 
and Zurn, Perry and Bassett, Danielle S.}, +journal = {bioRxiv}, +year = {2020}, +xoi = {10.1101/2020.10.12.336230}, +} + +@article{chatterjee2021gender, +journal = {JAMA Netw Open}, +year = {2021}, +volume = {4}, +number = {7}, +pages = {e2114509}, +title = {Gender Disparity in Citations in High-Impact Journal Articles}, +author = {Chatterjee, Paula and Werner, Rachel M}, +} + +@article{fulvio2021imbalance, +title = {Gender (Im)balance in Citation Practices in Cognitive Neuroscience}, +author = {Fulvio, Jacqueline M and Akinnola, Ileri and Postle, Bradley R}, +journal = {J Cogn Neurosci}, +year = {2021}, +volume = {33}, +number = {1}, +pages = {3-7}, +} + +@article{ethnicolr2022black, + title={Test of ethnicolr}, + author={Washington, Denzel and Brown-Jackson, Ketanji}, + journal={Nature neuroscience}, + volume={16}, + number={2}, + pages={130}, + year={2013}, + publisher={Nature Publishing Group} +} + +@article{ethnicolr2022hispanic, + title={Test of ethnicolr}, + author={Cruz, Rafael and Ocasio-Cortez, Alexandria}, + journal={Nature neuroscience}, + volume={16}, + number={2}, + pages={130}, + year={2013}, + publisher={Nature Publishing Group} +} + +@article{ethnicolr2022asian, + title={Test of ethnicolr}, + author={Wang, Andrew and Yeoh, Michelle}, + journal={Nature neuroscience}, + volume={16}, + number={2}, + pages={130}, + year={2013}, + publisher={Nature Publishing Group} +} + +@article{ethnicolr2022white, + title={Test of ethnicolr}, + author={Coppola, Nicolas and Streep, Meryl}, + journal={Nature neuroscience}, + volume={16}, + number={2}, + pages={130}, + year={2013}, + publisher={Nature Publishing Group} +} \ No newline at end of file diff --git a/tests/testBib_immaculate.bib b/tests/testBib_immaculate.bib new file mode 100644 index 0000000..039fa4c --- /dev/null +++ b/tests/testBib_immaculate.bib @@ -0,0 +1,257 @@ +@article{buzsaki2013memory, + title={Memory, navigation and theta rhythm in the hippocampal-entorhinal system}, + author={Buzs{\'a}ki, 
Gy{\"o}rgy and Moser, Edvard I}, + journal={Nature neuroscience}, + volume={16}, + number={2}, + pages={130}, + year={2013}, + publisher={Nature Publishing Group} +} + +@article{Lundine2019, + + abstract = {Academic experts share their ideas, as well as contribute to advancing health science by participating in publishing as an author, reviewer and editor. The academy shapes and is shaped by knowledge produced within it. As such, the production of scientific knowledge can be described as part of a socially constructed system. Like all socially constructed systems, scientific knowledge production is influenced by gender. This study investigated one layer of this system through an analysis of journal editors' understanding of if and how gender influences editorial practices in peer reviewed health science journals. The study involved two stages: 1) exploratory in-depth qualitative interviews with editors at health science journals; and 2) a nominal group technique (NGT) with experts working on gender in research, academia and the journal peer review process. Our findings indicate that some editors had not considered the impact of gender on their editorial work. Many described how they actively strive to be ‘gender blind,' as this was seen as a means to be objective. This view fails to recognize how broader social structures operate to produce systemic inequities. None of the editors or publishers in this study were collecting gender or other social indicators as part of the article submission process. These findings suggest that there is room for editors and publishers to play a more active role in addressing structural inequities in academic publishing to ensure a diversity of knowledge and ideas are reflected.}, + + author = {Lundine, J. 
and Bourgeault, Ivy Lynn and Glonti, Ketevan and Hutchinson, Eleanor and Balabanova, Dina}, + + doi = {10.1016/j.socscimed.2019.112388}, + + file = {:Users/stiso/Downloads/1-s2.0-S0277953619303739-main.pdf:pdf}, + + issn = {18735347}, + + journal = {Social Science and Medicine}, + + keywords = {Feminist science studies,Gender inequity,Health sciences,Journalology,Peer review,Publishing}, + + number = {January}, + + pages = {112388}, + + pmid = {31288167}, + + publisher = {Elsevier}, + + title = {{“I don't see gender”: Conceptualizing a gendered system of academic publishing}}, + + url = {https://doi.org/10.1016/j.socscimed.2019.112388}, + + volume = {235}, + + year = {2019} + +} + +@article{wang2021gendered, + + author = {Wang, Xinyi and Dworkin, Jordan D. and Zhou, Dale and Stiso, Jennifer and Falk, Emily B and Bassett, Danielle S. and Zurn, Perry and Lydon-Staley, David M.}, + + year = {2021}, + + title = {Gendered citation practices in the field of communication}, + + journal = {Annals of the International Communication Association}, + + doi = {10.1080/23808985.2021.1960180}, +} + +@article{zurn2020network, + title={Network architectures supporting learnability}, + author={Zurn, Perry and Bassett, Danielle S}, + journal={Philosophical Transactions of the Royal Society B}, + volume={375}, + number={1796}, + pages={20190323}, + year={2020}, + publisher={The Royal Society} +} + +@book{moralia2005, + title={Moralia, Volume VI}, + author={Plutarch, Helmbold, William}, + year={1939}, + publisher={Harvard University Press} +} + +@book{bassett2022curious, +title={Curious Minds}, +author={Danielle S. Bassett and Perry Zurn}, +publisher={MIT Press}, +year={2022}, +} + +@book{fake2022, +title={fake}, +author={Danielle S. 
Bassett and Dale Zhou and Jennifer Stiso}, +publisher={MIT Press}, +year={2022}, +} + +@article{jurafsky2018n, + title={N-gram language models}, + author={Jurafsky, D and Martin, JH}, + journal={Speech and Language Processing: An Introduction to Natural Language Processing, Computational Linguistics, and Speech Recognition}, + year={2018} +} + +@article {Dworkin2020.01.03.894378, + author = {Dworkin, Jordan D. and Linn, Kristin A. and Teich, Erin G. and Zurn, Perry and Shinohara, Russell T. and Bassett, Danielle S.}, + title = {The extent and drivers of gender imbalance in neuroscience reference lists}, + elocation-id = {2020.01.03.894378}, + year = {2020}, + doi = {10.1101/2020.01.03.894378}, + publisher = {Cold Spring Harbor Laboratory}, + abstract = {Like many scientific disciplines, neuroscience has increasingly attempted to confront pervasive gender imbalances within the field. While much of the conversation has centered around publishing and conference participation, recent research in other fields has called attention to the prevalence of gender bias in citation practices. Because of the downstream effects that citations can have on visibility and career advancement, understanding and eliminating gender bias in citation practices is vital for addressing inequity in a scientific community. In this study, we sought to determine whether there is evidence of gender bias in the citation practices of neuroscientists. Utilizing data from five top neuroscience journals, we indeed find that reference lists tend to include more papers with men as first and last author than would be expected if gender was not a factor in referencing. Importantly, we show that this overcitation of men and undercitation of women is driven largely by the citation practices of men, and is increasing with time despite greater diversity in the academy. 
We develop a co-authorship network to determine the degree to which homophily in researchers{\textquoteright} social networks explains gendered citation practices and we find that men tend to overcite other men even when their social networks are representative of the field. We discuss possible mechanisms and consider how individual researchers might incorporate these findings into their own referencing practices.}, + URL = {https://www.biorxiv.org/content/early/2020/01/11/2020.01.03.894378}, + eprint = {https://www.biorxiv.org/content/early/2020/01/11/2020.01.03.894378.full.pdf}, + journal = {bioRxiv} +} + +@article{maliniak2013gender, + title={The gender citation gap in international relations}, + author={Maliniak, Daniel and Powers, Ryan and Walter, Barbara F}, + journal={International Organization}, + volume={67}, + number={4}, + pages={889--922}, + year={2013}, + publisher={Cambridge University Press} +} + +@article{caplar2017quantitative, + title={Quantitative evaluation of gender bias in astronomical publications from citation counts}, + author={Caplar, Neven and Tacchella, Sandro and Birrer, Simon}, + journal={Nature Astronomy}, + volume={1}, + number={6}, + pages={0141}, + year={2017}, + publisher={Nature Publishing Group} +} + +@article{mitchell2013gendered, + title={Gendered citation patterns in international relations journals}, + author={Mitchell, Sara McLaughlin and Lange, Samantha and Brus, Holly}, + journal={International Studies Perspectives}, + volume={14}, + number={4}, + pages={485--492}, + year={2013}, + publisher={Blackwell Publishing Ltd Oxford, UK} +} + +@article{dion2018gendered, + title={Gendered citation patterns across political science and social science methodology fields}, + author={Dion, Michelle L and Sumner, Jane Lawrence and Mitchell, Sara McLaughlin}, + journal={Political Analysis}, + volume={26}, + number={3}, + pages={312--327}, + year={2018}, + publisher={Cambridge University Press} +} + +@software{zhou_dale_2020_3672110, + 
author = {Zhou, Dale and + Cornblath, Eli J. and + Stiso, Jennifer and + Teich, Erin G. and + Dworkin, Jordan D. and + Blevins, Ann S. and + Bassett, Danielle S.}, + title = {Gender Diversity Statement and Code Notebook v1.0}, + month = feb, + year = 2020, + publisher = {Zenodo}, + version = {v1.0}, + doi = {10.5281/zenodo.3672110}, + url = {https://doi.org/10.5281/zenodo.3672110} +} + +@inproceedings{ambekar2009name, + title={Name-ethnicity classification from open sources}, + author={Ambekar, Anurag and Ward, Charles and Mohammed, Jahangir and Male, Swapna and Skiena, Steven}, + booktitle={Proceedings of the 15th ACM SIGKDD international conference on Knowledge Discovery and Data Mining}, + pages={49--58}, + year={2009} +} + +@article{sood2018predicting, + title={Predicting race and ethnicity from the sequence of characters in a name}, + author={Sood, Gaurav and Laohaprapanon, Suriyan}, + journal={arXiv preprint arXiv:1805.02109}, + year={2018} +} + +@article{bertolero2021racial, +title = {Racial and ethnic imbalance in neuroscience reference lists and intersections with gender}, +author = {Bertolero, Maxwell A. and Dworkin, Jordan D. and David, Sophia U. and Lloreda, Claudia López and Srivastava, Pragya and Stiso, Jennifer and Zhou, Dale and Dzirasa, Kafui and Fair, Damien A. and Kaczkurkin, Antonia N. and Marlin, Bianca Jones and Shohamy, Daphna and Uddin, Lucina Q. 
and Zurn, Perry and Bassett, Danielle S.}, +journal = {bioRxiv}, +year = {2020}, +xoi = {10.1101/2020.10.12.336230}, +} + +@article{chatterjee2021gender, +journal = {JAMA Netw Open}, +year = {2021}, +volume = {4}, +number = {7}, +pages = {e2114509}, +title = {Gender Disparity in Citations in High-Impact Journal Articles}, +author = {Chatterjee, Paula and Werner, Rachel M}, +} + +@article{fulvio2021imbalance, +title = {Gender (Im)balance in Citation Practices in Cognitive Neuroscience}, +author = {Fulvio, Jacqueline M and Akinnola, Ileri and Postle, Bradley R}, +journal = {J Cogn Neurosci}, +year = {2021}, +volume = {33}, +number = {1}, +pages = {3-7}, +} + +@article{ethnicolr2022black, + title={Test of ethnicolr}, + author={Washington, Denzel and Brown-Jackson, Ketanji}, + journal={Nature neuroscience}, + volume={16}, + number={2}, + pages={130}, + year={2013}, + publisher={Nature Publishing Group} +} + +@article{ethnicolr2022hispanic, + title={Test of ethnicolr}, + author={Cruz, Rafael and Ocasio-Cortez, Alexandria}, + journal={Nature neuroscience}, + volume={16}, + number={2}, + pages={130}, + year={2013}, + publisher={Nature Publishing Group} +} + +@article{ethnicolr2022asian, + title={Test of ethnicolr}, + author={Wang, Andrew and Yeoh, Michelle}, + journal={Nature neuroscience}, + volume={16}, + number={2}, + pages={130}, + year={2013}, + publisher={Nature Publishing Group} +} + +@article{ethnicolr2022white, + title={Test of ethnicolr}, + author={Coppola, Nicolas and Streep, Meryl}, + journal={Nature neuroscience}, + volume={16}, + number={2}, + pages={130}, + year={2013}, + publisher={Nature Publishing Group} +} \ No newline at end of file From 5ac8bef9e91eeaf7cc3b103e17b87a561d5288d4 Mon Sep 17 00:00:00 2001 From: Jennifer Stiso Date: Mon, 25 Apr 2022 15:10:25 -0400 Subject: [PATCH 02/47] added existing fucntions to scripts --- utils/__init__.py | 0 utils/preprocessing.py | 55 ++++++++++++++++++++++++++++++++++++++++++ utils/queries.py | 51 
+++++++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+) create mode 100644 utils/__init__.py create mode 100644 utils/preprocessing.py create mode 100644 utils/queries.py diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/preprocessing.py b/utils/preprocessing.py new file mode 100644 index 0000000..10933ac --- /dev/null +++ b/utils/preprocessing.py @@ -0,0 +1,55 @@ +def checkcites_output(aux_file): + '''take in aux file for tex document, return list of citation keys + that are in .bib file but not in document''' + + result = subprocess.run(['texlua', 'checkcites.lua', aux_file[0]], stdout=subprocess.PIPE) + result = result.stdout.decode('utf-8') + unused_array_raw = result.split('\n') + # process array of unused references + other output + unused_array_final = list() + for x in unused_array_raw: + if len(x) > 0: # if line is not empty + if x[0] == '-': # and if first character is a '-', it's a citation key + unused_array_final.append(x[2:]) # truncate '- ' + if "------------------------------------------------------------------------" in unused_array_final: + return(result) + else: + return(unused_array_final) + + +def removeMiddleName(line): + arr = line.split() + last = arr.pop() + n = len(arr) + if n == 4: + first, middle = ' '.join(arr[:2]), ' '.join(arr[2:]) + elif n == 3: + first, middle = arr[0], ' '.join(arr[1:]) + elif n == 2: + first, middle = arr + elif n==1: + return line + return(str(first + ' ' + middle)) + + +def returnFirstName(line): + arr = line.split() + n = len(arr) + if n == 4: + first, middle = ' '.join(arr[:2]), ' '.join(arr[2:]) + elif n == 3: + first, middle = arr[0], ' '.join(arr[1:]) + elif n == 2: + first, middle = arr + elif n==1: + return line + return(str(middle)) + + +def convertLatexSpecialChars(latex_text): + return LatexNodes2Text().latex_to_text(latex_text) + + +def convertSpecialCharsToUTF8(text): + data = LatexNodes2Text().latex_to_text(text) + 
return unicodedata.normalize('NFD', data).encode('ascii', 'ignore').decode('utf-8') diff --git a/utils/queries.py b/utils/queries.py new file mode 100644 index 0000000..7d0987e --- /dev/null +++ b/utils/queries.py @@ -0,0 +1,51 @@ +def namesFromXref(doi, title, authorPos): + '''Use DOI and article titles to query Crossref for author list''' + if authorPos == 'first': + idx = 0 + elif authorPos == 'last': + idx = -1 + # get cross ref data + authors = [''] + # first try DOI + if doi != "": + works = cr.works(query=title, select=["DOI", "author"], limit=1, filter={'doi': doi}) + if works['message']['total-results'] > 0: + authors = works['message']['items'][0]['author'] + elif title != '': + works = cr.works(query=f'title:"{title}"', select=["title", "author"], limit=10) + cnt = 0 + name = '' + # check that you grabbed the proper paper + if works['message']['items'][cnt]['title'][0].lower() == title.lower(): + authors = works['message']['items'][0]['author'] + + # check the all fields are available + if not 'given' in authors[idx]: + name = '' + else: + # trim initials + name = authors[idx]['given'].replace('.', ' ').split()[0] + + return name + + +def namesFromXrefSelfCite(doi, title): + selfCiteCheck = 0 + # get cross ref data + authors = [''] + # first try DOI + if doi != "": + works = cr.works(query=title, select=["DOI", "author"], limit=1, filter={'doi': doi}) + if works['message']['total-results'] > 0: + authors = works['message']['items'][0]['author'] + + for i in authors: + if i != "": + first = i['given'].replace('.', ' ').split()[0] + last = i['family'].replace('.', ' ').split()[0] + authors = removeMiddleName(last + ", " + first) + if authors in removeMiddleName(yourFirstAuthor) or authors in removeMiddleName( + convertSpecialCharsToUTF8(yourFirstAuthor)) or authors in removeMiddleName( + yourLastAuthor) or authors in removeMiddleName(convertSpecialCharsToUTF8(yourLastAuthor)): + selfCiteCheck += 1 + return selfCiteCheck From 
9470d693511a2ead2b48d0dbf81026240a52ed17 Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Mon, 25 Apr 2022 15:13:52 -0400 Subject: [PATCH 03/47] get rid of r files --- environment.yaml | 2 -- install.R | 2 -- postBuild | 1 - requirements.txt | 2 -- 4 files changed, 7 deletions(-) delete mode 100644 install.R delete mode 100644 postBuild diff --git a/environment.yaml b/environment.yaml index f6d8b41..e1d9167 100644 --- a/environment.yaml +++ b/environment.yaml @@ -16,8 +16,6 @@ dependencies: - pandas - re - pylatexenc - - sos - - sos-notebook - habanero - tqdm - json diff --git a/install.R b/install.R deleted file mode 100644 index 87f4b40..0000000 --- a/install.R +++ /dev/null @@ -1,2 +0,0 @@ -install.packages('rjson') -install.packages('ggplot2') \ No newline at end of file diff --git a/postBuild b/postBuild deleted file mode 100644 index 279de15..0000000 --- a/postBuild +++ /dev/null @@ -1 +0,0 @@ -python3 -m sos_notebook.install diff --git a/requirements.txt b/requirements.txt index 1bc7eef..a4ffc94 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,8 +3,6 @@ numpy==1.19.2 bibtexparser==1.1.0 pandas==1.1.3 pylatexenc==2.1 -sos==0.21.5 -sos-notebook==0.21.7 habanero==0.7.2 ethnicolr==0.4.0 matplotlib==3.3.2 From 2513e4eb7ee3fc69f28339d85d6210c469f0505f Mon Sep 17 00:00:00 2001 From: Jennifer Stiso Date: Mon, 25 Apr 2022 15:16:53 -0400 Subject: [PATCH 04/47] added imports to function scripts --- utils/preprocessing.py | 33 +++++++++++++++++++++++++++++---- utils/queries.py | 21 +-------------------- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/utils/preprocessing.py b/utils/preprocessing.py index 10933ac..09f5224 100644 --- a/utils/preprocessing.py +++ b/utils/preprocessing.py @@ -1,3 +1,7 @@ +import subprocess +from pylatexenc.latex2text import LatexNodes2Text +import unicodedata + def checkcites_output(aux_file): '''take in aux file for tex document, return list of citation keys that are in .bib file but not in document''' @@ -12,9 +16,9 
@@ def checkcites_output(aux_file): if x[0] == '-': # and if first character is a '-', it's a citation key unused_array_final.append(x[2:]) # truncate '- ' if "------------------------------------------------------------------------" in unused_array_final: - return(result) + return result else: - return(unused_array_final) + return unused_array_final def removeMiddleName(line): @@ -29,7 +33,7 @@ def removeMiddleName(line): first, middle = arr elif n==1: return line - return(str(first + ' ' + middle)) + return str(first + ' ' + middle) def returnFirstName(line): @@ -43,7 +47,7 @@ def returnFirstName(line): first, middle = arr elif n==1: return line - return(str(middle)) + return str(middle) def convertLatexSpecialChars(latex_text): @@ -53,3 +57,24 @@ def convertLatexSpecialChars(latex_text): def convertSpecialCharsToUTF8(text): data = LatexNodes2Text().latex_to_text(text) return unicodedata.normalize('NFD', data).encode('ascii', 'ignore').decode('utf-8') + +def namesFromXrefSelfCite(doi, title): + selfCiteCheck = 0 + # get cross ref data + authors = [''] + # first try DOI + if doi != "": + works = cr.works(query=title, select=["DOI", "author"], limit=1, filter={'doi': doi}) + if works['message']['total-results'] > 0: + authors = works['message']['items'][0]['author'] + + for i in authors: + if i != "": + first = i['given'].replace('.', ' ').split()[0] + last = i['family'].replace('.', ' ').split()[0] + authors = removeMiddleName(last + ", " + first) + if authors in removeMiddleName(yourFirstAuthor) or authors in removeMiddleName( + convertSpecialCharsToUTF8(yourFirstAuthor)) or authors in removeMiddleName( + yourLastAuthor) or authors in removeMiddleName(convertSpecialCharsToUTF8(yourLastAuthor)): + selfCiteCheck += 1 + return selfCiteCheck \ No newline at end of file diff --git a/utils/queries.py b/utils/queries.py index 7d0987e..7cac1ac 100644 --- a/utils/queries.py +++ b/utils/queries.py @@ -1,4 +1,4 @@ -def namesFromXref(doi, title, authorPos): +def 
namesFromXref(cr, doi, title, authorPos): '''Use DOI and article titles to query Crossref for author list''' if authorPos == 'first': idx = 0 @@ -29,23 +29,4 @@ def namesFromXref(doi, title, authorPos): return name -def namesFromXrefSelfCite(doi, title): - selfCiteCheck = 0 - # get cross ref data - authors = [''] - # first try DOI - if doi != "": - works = cr.works(query=title, select=["DOI", "author"], limit=1, filter={'doi': doi}) - if works['message']['total-results'] > 0: - authors = works['message']['items'][0]['author'] - for i in authors: - if i != "": - first = i['given'].replace('.', ' ').split()[0] - last = i['family'].replace('.', ' ').split()[0] - authors = removeMiddleName(last + ", " + first) - if authors in removeMiddleName(yourFirstAuthor) or authors in removeMiddleName( - convertSpecialCharsToUTF8(yourFirstAuthor)) or authors in removeMiddleName( - yourLastAuthor) or authors in removeMiddleName(convertSpecialCharsToUTF8(yourLastAuthor)): - selfCiteCheck += 1 - return selfCiteCheck From ff38c5010ca2eedc83e7efa25728537819ff1a23 Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Mon, 2 May 2022 14:49:22 -0400 Subject: [PATCH 05/47] add test aux --- tests/document.aux | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 tests/document.aux diff --git a/tests/document.aux b/tests/document.aux new file mode 100644 index 0000000..ed771f5 --- /dev/null +++ b/tests/document.aux @@ -0,0 +1,45 @@ +\relax +\citation{Lundine2019} +\citation{wang2021gendered} +\citation{zurn2020network} +\citation{moralia2005} +\citation{bassett2022curious} +\citation{fake2022} +\citation{Dworkin2020.01.03.894378} +\citation{maliniak2013gender} +\citation{caplar2017quantitative} +\citation{mitchell2013gendered} +\citation{dion2018gendered} +\citation{zhou_dale_2020_3672110} +\citation{ambekar2009name} +\citation{sood2018predicting} +\citation{bertolero2021racial} +\citation{chatterjee2021gender} +\citation{fulvio2021imbalance} 
+\citation{ethnicolr2022black} +\citation{ethnicolr2022hispanic} +\citation{ethnicolr2022asian} +\citation{ethnicolr2022white} +\bibstyle{ieeetr} +\bibdata{testBib_immaculate.bib} +\bibcite{Lundine2019}{1} +\bibcite{wang2021gendered}{2} +\bibcite{zurn2020network}{3} +\bibcite{moralia2005}{4} +\bibcite{bassett2022curious}{5} +\bibcite{fake2022}{6} +\bibcite{Dworkin2020.01.03.894378}{7} +\bibcite{maliniak2013gender}{8} +\bibcite{caplar2017quantitative}{9} +\bibcite{mitchell2013gendered}{10} +\bibcite{dion2018gendered}{11} +\bibcite{zhou_dale_2020_3672110}{12} +\bibcite{ambekar2009name}{13} +\bibcite{sood2018predicting}{14} +\bibcite{bertolero2021racial}{15} +\bibcite{chatterjee2021gender}{16} +\bibcite{fulvio2021imbalance}{17} +\bibcite{ethnicolr2022black}{18} +\bibcite{ethnicolr2022hispanic}{19} +\bibcite{ethnicolr2022asian}{20} +\bibcite{ethnicolr2022white}{21} From 22bc19ebef833a52ba2d2b710e298d290e78a7c0 Mon Sep 17 00:00:00 2001 From: Jennifer Stiso Date: Mon, 2 May 2022 15:08:47 -0400 Subject: [PATCH 06/47] breaking first cell into fn units --- .gitignore | 1 + requirements.txt | 136 ++++++++++++-- utils/preprocessing.py | 397 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 518 insertions(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index 10346f0..c7b6bf3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .DS_Store .ipynb_checkpoints/* +env/* diff --git a/requirements.txt b/requirements.txt index a4ffc94..ff17b13 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,121 @@ -pybtex==0.22.2 -numpy==1.19.2 -bibtexparser==1.1.0 -pandas==1.1.3 -pylatexenc==2.1 -habanero==0.7.2 -ethnicolr==0.4.0 -matplotlib==3.3.2 -seaborn==0.11.0 -scipy==1.5.3 -h5py==2.10.0 -oauthlib==3.0.1 -rsa==4.7 -Keras==2.2.4 -tensorflow==1.15.2 \ No newline at end of file +absl-py==0.15.0 +appnope==0.1.3 +argon2-cffi==21.3.0 +argon2-cffi-bindings==21.2.0 +asttokens==2.0.5 +astunparse==1.6.3 +attrs==21.4.0 +backcall==0.2.0 +beautifulsoup4==4.11.1 
+bibtexparser==1.2.0 +bleach==5.0.0 +cachetools==5.0.0 +certifi==2021.10.8 +cffi==1.15.0 +charset-normalizer==2.0.12 +cycler==0.11.0 +debugpy==1.6.0 +decorator==5.1.1 +defusedxml==0.7.1 +entrypoints==0.4 +ethnicolr==0.8.1 +executing==0.8.3 +fastjsonschema==2.15.3 +flatbuffers==1.12 +fonttools==4.33.3 +future==0.18.2 +gast==0.4.0 +google-auth==2.6.6 +google-auth-oauthlib==0.4.6 +google-pasta==0.2.0 +grpcio==1.34.1 +h5py==3.1.0 +habanero==1.2.0 +idna==3.3 +importlib-metadata==4.11.3 +ipykernel==6.13.0 +ipython==8.3.0 +ipython-genutils==0.2.0 +ipywidgets==7.7.0 +jedi==0.18.1 +Jinja2==3.1.2 +jsonschema==4.4.0 +jupyter==1.0.0 +jupyter-client==7.3.0 +jupyter-console==6.4.3 +jupyter-core==4.10.0 +jupyterlab-pygments==0.2.2 +jupyterlab-widgets==1.1.0 +keras==2.8.0 +keras-nightly==2.5.0.dev2021032900 +Keras-Preprocessing==1.1.2 +kiwisolver==1.4.2 +latexcodec==2.0.1 +Markdown==3.3.6 +MarkupSafe==2.1.1 +matplotlib==3.5.1 +matplotlib-inline==0.1.3 +mistune==0.8.4 +nbclient==0.6.0 +nbconvert==6.5.0 +nbformat==5.3.0 +nest-asyncio==1.5.5 +notebook==6.4.11 +numpy==1.19.5 +oauthlib==3.2.0 +opt-einsum==3.3.0 +packaging==21.3 +pandas==1.4.2 +pandocfilters==1.5.0 +parso==0.8.3 +pexpect==4.8.0 +pickleshare==0.7.5 +Pillow==9.1.0 +prometheus-client==0.14.1 +prompt-toolkit==3.0.29 +protobuf==3.20.1 +psutil==5.9.0 +ptyprocess==0.7.0 +pure-eval==0.2.2 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pybtex==0.24.0 +pycparser==2.21 +Pygments==2.12.0 +pylatexenc==2.10 +pyparsing==3.0.8 +pyrsistent==0.18.1 +python-dateutil==2.8.2 +pytz==2022.1 +PyYAML==6.0 +pyzmq==22.3.0 +qtconsole==5.3.0 +QtPy==2.1.0 +requests==2.27.1 +requests-oauthlib==1.3.1 +rsa==4.8 +scipy==1.8.0 +seaborn==0.11.2 +Send2Trash==1.8.0 +six==1.15.0 +soupsieve==2.3.2.post1 +stack-data==0.2.0 +tensorboard==2.9.0 +tensorboard-data-server==0.6.1 +tensorboard-plugin-wit==1.8.1 +tensorflow==2.5.2 +tensorflow-estimator==2.5.0 +termcolor==1.1.0 +terminado==0.13.3 +tinycss2==1.1.1 +tornado==6.1 +tqdm==4.64.0 +traitlets==5.1.1 
+typing-extensions==3.7.4.3 +urllib3==1.26.9 +wcwidth==0.2.5 +webencodings==0.5.1 +Werkzeug==2.1.2 +widgetsnbextension==3.6.0 +wrapt==1.12.1 +zipp==3.8.0 diff --git a/utils/preprocessing.py b/utils/preprocessing.py index 09f5224..890a056 100644 --- a/utils/preprocessing.py +++ b/utils/preprocessing.py @@ -20,8 +20,34 @@ def checkcites_output(aux_file): else: return unused_array_final +def clean_name(name, flag): + """ + + :param name: + flag: utf or latex + :return: clean_name + """ + if flag=='latex': + clean_name = convertLatexSpecialChars(str(name)[7:-3]).replace( + "', Protected('", "" + ).replace( + "'), '", "" + ) + elif flag=='utf': + clean_name = convertSpecialCharsToUTF8(str(name)[7:-3]).replace( + "', Protected('", "" + ).replace( + "'), '", "" + ) + else: + raise ValueError def removeMiddleName(line): + """ + + :param line: + :return: + """ arr = line.split() last = arr.pop() n = len(arr) @@ -37,6 +63,11 @@ def removeMiddleName(line): def returnFirstName(line): + """ + + :param line: + :return: + """ arr = line.split() n = len(arr) if n == 4: @@ -51,14 +82,30 @@ def returnFirstName(line): def convertLatexSpecialChars(latex_text): + """ + + :param latex_text: + :return: + """ return LatexNodes2Text().latex_to_text(latex_text) def convertSpecialCharsToUTF8(text): + """ + + :param text: + :return: + """ data = LatexNodes2Text().latex_to_text(text) return unicodedata.normalize('NFD', data).encode('ascii', 'ignore').decode('utf-8') def namesFromXrefSelfCite(doi, title): + """ + + :param doi: + :param title: + :return: + """ selfCiteCheck = 0 # get cross ref data authors = [''] @@ -77,4 +124,352 @@ def namesFromXrefSelfCite(doi, title): convertSpecialCharsToUTF8(yourFirstAuthor)) or authors in removeMiddleName( yourLastAuthor) or authors in removeMiddleName(convertSpecialCharsToUTF8(yourLastAuthor)): selfCiteCheck += 1 - return selfCiteCheck \ No newline at end of file + return selfCiteCheck + + +def find_unused_cites(paper_aux_file): + """ + + :param 
paper_aux_file: path to auxfile + :return: + """ + print(checkcites_output(paper_aux_file)) + unused_in_paper = checkcites_output(paper_aux_file) # get citations in library not used in paper + print("Unused citations: ", unused_in_paper.count('=>')) + +def get_bib_data(homedir): + """ + + :param homedir: home directory + :return: bib_data + """ + ID = glob.glob(homedir + '*bib') + with open(ID[0]) as bibtex_file: + bib_data = bibtexparser.bparser.BibTexParser(common_strings=True, + ignore_nonstandard_types=False).parse_file(bibtex_file) + return bib_data + +def get_duplicates(bib_data): + """ + take bib_data, and get duplicates + :param homedir: home directory + :return: + """ + + duplicates = [] + for key in bib_data.entries_dict.keys(): + count = str(bib_data.entries).count("'ID\': \'" + key + "\'") + if count > 1: + duplicates.append(key) + + if len(duplicates) > 0: + raise ValueError("In your .bib file, we found and removed duplicate entries for:", + ' '.join(map(str, duplicates))) + + +def get_names_published(outPath, bib_data): + """ + whole pipeline for published papers + :return: FA, + LA + """ + FA = [] + LA = [] + counter = 1 + selfCiteCount = 0 + titleCount = 1 # + counterNoDOI = list() # row index (titleCount) of entries with no DOI + outPath = homedir + 'cleanedBib.csv' + + if os.path.exists(outPath): + os.remove(outPath) + + with open(outPath, 'w', newline='') as csvfile: + writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) + writer.writerow(['Article', 'FA', 'LA', 'Title', 'SelfCite', 'CitationKey']) + + citedArticleDOI = list() + citedArticleNoDOI = list() + allArticles = list() + for entry in bib_data.entries: + my_string = entry['cited-references'].split('\n') + for citedArticle in my_string: + allArticles.append(citedArticle) + if citedArticle.partition("DOI ")[-1] == '': + citedArticleNoDOI.append(citedArticle) + counterNoDOI.append(titleCount) + else: + line = citedArticle.partition("DOI ")[-1].replace("DOI ", 
"").rstrip(".") + line = ''.join(c for c in line if c not in '{[}] ') + if "," in line: + line = line.partition(",")[-1] + citedArticleDOI.append(line) + with open('citedArticlesDOI.csv', 'a', newline='') as csvfile: + writer = csv.writer(csvfile, delimiter=',') + writer.writerow([line]) + titleCount += 1 + + articleNum = 0 + for doi in citedArticleDOI: + try: + FA = namesFromXref(doi, '', 'first') + except UnboundLocalError: + sleep(1) + continue + + try: + LA = namesFromXref(doi, '', 'last') + except UnboundLocalError: + sleep(1) + continue + + try: + selfCiteCount = namesFromXrefSelfCite(doi, '') + except UnboundLocalError: + sleep(1) + continue + + with open(outPath, 'a', newline='') as csvfile: + if selfCiteCount == 0: + writer = csv.writer(csvfile, delimiter=',') + getArticleIndex = [i for i, s in enumerate(allArticles) if doi in s] + writer.writerow([counter, convertSpecialCharsToUTF8(FA), convertSpecialCharsToUTF8(LA), + allArticles[[i for i, s in enumerate(allArticles) if doi in s][0]], '', '']) + print(str(counter) + ": " + doi) + counter += 1 + else: + print(str(articleNum) + ": " + doi + "\t\t\t <-- self-citation") + articleNum += 1 + + if len(citedArticleNoDOI) > 0: + print() + for elem in citedArticleNoDOI: + with open(outPath, 'a', newline='') as csvfile: + writer = csv.writer(csvfile, delimiter=',') + writer.writerow([counter, '', '', elem, '', '']) + print(str(counter) + ": " + elem) + counter += 1 + print() + raise ValueError("WARNING: No article DOI was provided for the last " + str( + len(citedArticleNoDOI)) + " listed papers. Please manually search for these articles. IF AND ONLY IF your citing paper's first and last author are not co-authors in the paper that was cited, enter the first name of the first and last authors of the paper that was cited manually. Then, continue to the next code block.") + + return FA, LA + + +def get_names(bib_data): + """ + take bib_data, and get lists of first and last names. 
should also get self cites and CDS cites + :return: FA + LA + """ + counter = 1 + nameCount = 0 + outPath = homedir + 'cleanedBib.csv' + + if os.path.exists(outPath): + os.remove(outPath) + + with open(outPath, 'w', newline='') as csvfile: + writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) + writer.writerow(['Article', 'FA', 'LA', 'Title', 'SelfCite', 'CitationKey']) + + for key in bib_data.entries.keys(): + diversity_bib_titles = ['The extent and drivers of gender imbalance in neuroscience reference lists', + 'The gender citation gap in international relations', + 'Quantitative evaluation of gender bias in astronomical publications from citation counts', + '\# CommunicationSoWhite', + '{Just Ideas? The Status and Future of Publication Ethics in Philosophy: A White Paper}', + 'Gendered citation patterns across political science and social science methodology fields', + 'Gender Diversity Statement and Code Notebook v1.0', + 'Racial and ethnic imbalance in neuroscience reference lists and intersections with gender', + 'Gender Diversity Statement and Code Notebook v1.1', + 'Gendered citation practices in the field of communication', + 'Gender disparity in citations in high- impact journal articles', + 'Gender (im)balance in citation practices in cognitive neuroscience', + 'Name-ethnicity classification from open sources', + 'Predicting race and ethnicity from the sequence of characters in a name'] + if bib_data.entries[key].fields['title'] in diversity_bib_titles: + continue + + try: + author = bib_data.entries[key].persons['author'] + except: + author = bib_data.entries[key].persons['editor'] + FA = author[0].rich_first_names + LA = author[-1].rich_first_names + FA = convertLatexSpecialChars(str(FA)[7:-3]).translate(str.maketrans('', '', string.punctuation)).replace( + 'Protected', "").replace(" ", '') + LA = convertLatexSpecialChars(str(LA)[7:-3]).translate(str.maketrans('', '', string.punctuation)).replace( + 'Protected', 
"").replace(" ", '') + + # check if we grabbed a first initial when a full middle name was available + if (len(FA) == 1): + mn = author[0].rich_middle_names + mn = convertLatexSpecialChars(str(mn)[7:-3]).translate( + str.maketrans('', '', string.punctuation)).replace('Protected', "").replace(" ", '') + if len(mn) > 1: + FA = mn + if (len(LA) == 1): + mn = author[-1].rich_middle_names + mn = convertLatexSpecialChars(str(mn)[7:-3]).translate( + str.maketrans('', '', string.punctuation)).replace('Protected', "").replace(" ", '') + if len(mn) > 1: + LA = mn + + # check that we got a name (not an initial) from the bib file, if not try using the title in the crossref API + try: + title = bib_data.entries[key].fields['title'].replace(',', '').\ + replace(',', '').replace('{', '').replace('}','') + except: + title = '' + try: + doi = bib_data.entries[key].fields['doi'] + except: + doi = '' + if FA == '' or len(FA.split('.')[0]) <= 1: + while True: + try: + FA = namesFromXref(doi, title, 'first') + except UnboundLocalError: + sleep(1) + continue + break + if LA == '' or len(LA.split('.')[0]) <= 1: + while True: + try: + LA = namesFromXref(doi, title, 'last') + except UnboundLocalError: + sleep(1) + continue + break + + self_cites(author, yourFirstAuthor,yourLastAuthor, optionalEqualContributors) + counter += 1 + with open(outPath, 'a', newline='') as csvfile: + writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) + writer.writerow( + [counter, convertSpecialCharsToUTF8(FA), convertSpecialCharsToUTF8(LA), title, selfCite, key]) + + +def self_cites(author, yourFirstAuthor, yourLastAuthor, optionalEqualContributors): + """ + take author list, and find self citations + + :param author: + :param yourFirstAuthor: + :param yourLastAuthor: + :param optionalEqualContributors: + :return: + """ + if (yourFirstAuthor == 'LastName, FirstName OptionalMiddleInitial') or ( + yourLastAuthor == 'LastName, FirstName OptionalMiddleInitial'): + raise 
ValueError("Please enter your manuscript's first and last author names") + + selfCiteCheck1 = [s for s in author if removeMiddleName(yourLastAuthor) in + str( + [clean_name(s.rich_last_names, 'latex'), + clean_name(s.rich_first_names, 'latex')] + ).replace("'", "")] + + selfCiteCheck1a = [s for s in author if removeMiddleName(yourLastAuthor) in + str( + [clean_name(s.rich_last_names, 'utf'), + clean_name(s.rich_first_names, 'utf')] + ).replace("'", "")] + selfCiteCheck1b = [s for s in author if removeMiddleName(yourLastAuthor) in + str( + [clean_name(s.rich_last_names, 'utf'), + LA]).replace("'","")] + # I was in the process of cleaning all thisup when we stopped + selfCiteCheck2 = [s for s in author if removeMiddleName(yourFirstAuthor) in str([ + convertLatexSpecialChars( + str(s.rich_last_names)[ + 7:-3]).replace( + "', Protected('", + "").replace( + "'), '", ""), + convertLatexSpecialChars( + str(s.rich_first_names)[ + 7:-3]).replace( + "', Protected('", + "").replace( + "'), '", + "")]).replace( + "'", "")] + selfCiteCheck2a = [s for s in author if removeMiddleName(yourFirstAuthor) in str([ + convertSpecialCharsToUTF8( + str(s.rich_last_names)[ + 7:-3]).replace( + "', Protected('", + "").replace( + "'), '", ""), + convertSpecialCharsToUTF8( + str(s.rich_first_names)[ + 7:-3]).replace( + "', Protected('", + "").replace( + "'), '", + "")]).replace( + "'", "")] + selfCiteCheck2b = [s for s in author if removeMiddleName(yourFirstAuthor) in str([ + convertSpecialCharsToUTF8( + str(s.rich_last_names)[ + 7:-3]).replace( + "', Protected('", + "").replace( + "'), '", ""), + FA]).replace("'", + "")] + + nameCount = 0 + if optionalEqualContributors != ( + 'LastName, FirstName OptionalMiddleInitial', 'LastName, FirstName OptionalMiddleInitial'): + for name in optionalEqualContributors: + selfCiteCheck3 = [s for s in author if removeMiddleName(name) in str([convertLatexSpecialChars( + str(s.rich_last_names)[7:-3]).replace("', Protected('", "").replace("'), '", ""), + 
convertLatexSpecialChars( + str(s.rich_first_names)[ + 7:-3]).replace( + "', Protected('", + "").replace("'), '", + "")]).replace( + "'", "")] + selfCiteCheck3a = [s for s in author if removeMiddleName(name) in str([ + convertSpecialCharsToUTF8( + str(s.rich_last_names)[ + 7:-3]).replace( + "', Protected('", + "").replace( + "'), '", ""), + convertSpecialCharsToUTF8( + str(s.rich_first_names)[ + 7:-3]).replace( + "', Protected('", + "").replace( + "'), '", + "")]).replace("'", + "")] + if len(selfCiteCheck3) > 0: + nameCount += 1 + if len(selfCiteCheck3a) > 0: + nameCount += 1 + selfCiteChecks = [selfCiteCheck1, selfCiteCheck1a, selfCiteCheck1b, selfCiteCheck2, selfCiteCheck2a, + selfCiteCheck2b] + if sum([len(check) for check in selfCiteChecks]) + nameCount > 0: + selfCite = 'Y' + if len(FA) < 2: + print( + str(counter) + ": " + key + "\t\t <-- self-citation <-- ***NAME MISSING OR POSSIBLY INCOMPLETE***") + else: + print(str(counter) + ": " + key + " <-- self-citation") + else: + selfCite = 'N' + if len(FA) < 2: + print(str(counter) + ": " + key + "\t\t <-- ***NAME MISSING OR POSSIBLY INCOMPLETE***") + else: + print(str(counter) + ": " + key) + + + + + From eec793ed3b6b5ea790d9a4e5df8f1451521cca59 Mon Sep 17 00:00:00 2001 From: Jennifer Stiso Date: Mon, 9 May 2022 16:19:32 -0400 Subject: [PATCH 07/47] added pipeline and got imports working --- tests/__init__.py | 0 tests/pipeline.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 tests/__init__.py create mode 100644 tests/pipeline.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/pipeline.py b/tests/pipeline.py new file mode 100644 index 0000000..f0e1e70 --- /dev/null +++ b/tests/pipeline.py @@ -0,0 +1,37 @@ +import glob +from habanero import Crossref +import sys +import os +wd = os.getcwd() +print(f'{wd[0:-6]}/utils') +print(f'{wd[0:-6]}/utils') +sys.path.insert(1, f'{wd[0:-6]}/utils') +from 
preprocessing import * + +cr = Crossref() +homedir = '/home/jovyan/' +bib_files = glob.glob(homedir + '*.bib') +paper_aux_file = glob.glob(homedir + '*.aux') +paper_bib_file = 'library_paper.bib' +try: + tex_file = glob.glob(homedir + "*.tex")[0] +except: + print('No optional .tex file found.') + +yourFirstAuthor = 'LastName, FirstName OptionalMiddleInitial' +yourLastAuthor = 'LastName, FirstName OptionalMiddleInitial' +optionalEqualContributors = ['LastName, FirstName OptionalMiddleInitial', 'LastName, FirstName OptionalMiddleInitial'] +checkingPublishedArticle = False + +## end of user input +if paper_aux_file: + find_unused_cites(paper_aux_file) + +bib_data = get_bib_data(homedir) +if checkingPublishedArticle: + FA,LA = get_names_published(homedir, bib_data) +else: + # find and print duplicates + get_duplicates(bib_data) + # get names, remove CDS, find self cites + FA,LA = get_names(bib_data) \ No newline at end of file From 65465c9f94be824e0e43817afa8e2543fcffabca Mon Sep 17 00:00:00 2001 From: Jennifer Stiso Date: Tue, 10 May 2022 14:31:46 -0400 Subject: [PATCH 08/47] added needed imports and arguments for 1st cell functions --- tests/immaculate/cleanedBib.csv | 15 ++ tests/immaculate/pipeline.py | 37 ++++ tests/immaculate/testBib_immaculate.bib | 257 ++++++++++++++++++++++++ 3 files changed, 309 insertions(+) create mode 100644 tests/immaculate/cleanedBib.csv create mode 100644 tests/immaculate/pipeline.py create mode 100644 tests/immaculate/testBib_immaculate.bib diff --git a/tests/immaculate/cleanedBib.csv b/tests/immaculate/cleanedBib.csv new file mode 100644 index 0000000..fb49df2 --- /dev/null +++ b/tests/immaculate/cleanedBib.csv @@ -0,0 +1,15 @@ +Article,FA,LA,Title,SelfCite,CitationKey +2,Gyorgy,Edvard,Memory navigation and theta rhythm in the hippocampal-entorhinal system,N,buzsaki2013memory +3,Jamie,Dina,“I don't see gender”: Conceptualizing a gendered system of academic publishing,N,Lundine2019 +4,Perry,Danielle,Network architectures supporting 
learnability,N,zurn2020network +5,William,William,Moralia Volume VI,N,moralia2005 +6,Danielle,Perry,Curious Minds,N,bassett2022curious +7,Danielle,Jennifer,fake,Y,fake2022 +8,,JH,N-gram language models,N,jurafsky2018n +9,Sara,Holly,Gendered citation patterns in international relations journals,N,mitchell2013gendered +10,Paula,Rachel,Gender Disparity in Citations in High-Impact Journal Articles,N,chatterjee2021gender +11,Jacqueline,Bradley,Gender (Im)balance in Citation Practices in Cognitive Neuroscience,N,fulvio2021imbalance +12,Denzel,Ketanji,Test of ethnicolr,N,ethnicolr2022black +13,Rafael,Alexandria,Test of ethnicolr,N,ethnicolr2022hispanic +14,Andrew,Michelle,Test of ethnicolr,N,ethnicolr2022asian +15,Nicolas,Meryl,Test of ethnicolr,N,ethnicolr2022white diff --git a/tests/immaculate/pipeline.py b/tests/immaculate/pipeline.py new file mode 100644 index 0000000..999665b --- /dev/null +++ b/tests/immaculate/pipeline.py @@ -0,0 +1,37 @@ +import glob +from habanero import Crossref +import sys +import os +from pathlib import Path +wd = Path(os.getcwd()) +sys.path.insert(1, f'{wd.parent.parent.absolute()}/utils') +from preprocessing import * + +cr = Crossref() +#homedir = '/home/jovyan/' +homedir = os.getcwd() + '/' +bib_files = glob.glob(homedir + '*.bib') +paper_aux_file = glob.glob(homedir + '*.aux') +paper_bib_file = 'library_paper.bib' +try: + tex_file = glob.glob(homedir + "*.tex")[0] +except: + print('No optional .tex file found.') + +yourFirstAuthor = 'Stiso, Jennifer ' +yourLastAuthor = 'Bassett, Dani ' +optionalEqualContributors = ['Zhou, Dale'] +checkingPublishedArticle = False + +## end of user input +if paper_aux_file: + find_unused_cites(paper_aux_file) + +bib_data = get_bib_data(homedir) +if checkingPublishedArticle: + get_names_published(homedir, bib_data, cr) +else: + # find and print duplicates + get_duplicates(bib_data) + # get names, remove CDS, find self cites + get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, 
optionalEqualContributors, cr) \ No newline at end of file diff --git a/tests/immaculate/testBib_immaculate.bib b/tests/immaculate/testBib_immaculate.bib new file mode 100644 index 0000000..039fa4c --- /dev/null +++ b/tests/immaculate/testBib_immaculate.bib @@ -0,0 +1,257 @@ +@article{buzsaki2013memory, + title={Memory, navigation and theta rhythm in the hippocampal-entorhinal system}, + author={Buzs{\'a}ki, Gy{\"o}rgy and Moser, Edvard I}, + journal={Nature neuroscience}, + volume={16}, + number={2}, + pages={130}, + year={2013}, + publisher={Nature Publishing Group} +} + +@article{Lundine2019, + + abstract = {Academic experts share their ideas, as well as contribute to advancing health science by participating in publishing as an author, reviewer and editor. The academy shapes and is shaped by knowledge produced within it. As such, the production of scientific knowledge can be described as part of a socially constructed system. Like all socially constructed systems, scientific knowledge production is influenced by gender. This study investigated one layer of this system through an analysis of journal editors' understanding of if and how gender influences editorial practices in peer reviewed health science journals. The study involved two stages: 1) exploratory in-depth qualitative interviews with editors at health science journals; and 2) a nominal group technique (NGT) with experts working on gender in research, academia and the journal peer review process. Our findings indicate that some editors had not considered the impact of gender on their editorial work. Many described how they actively strive to be ‘gender blind,' as this was seen as a means to be objective. This view fails to recognize how broader social structures operate to produce systemic inequities. None of the editors or publishers in this study were collecting gender or other social indicators as part of the article submission process. 
These findings suggest that there is room for editors and publishers to play a more active role in addressing structural inequities in academic publishing to ensure a diversity of knowledge and ideas are reflected.}, + + author = {Lundine, J. and Bourgeault, Ivy Lynn and Glonti, Ketevan and Hutchinson, Eleanor and Balabanova, Dina}, + + doi = {10.1016/j.socscimed.2019.112388}, + + file = {:Users/stiso/Downloads/1-s2.0-S0277953619303739-main.pdf:pdf}, + + issn = {18735347}, + + journal = {Social Science and Medicine}, + + keywords = {Feminist science studies,Gender inequity,Health sciences,Journalology,Peer review,Publishing}, + + number = {January}, + + pages = {112388}, + + pmid = {31288167}, + + publisher = {Elsevier}, + + title = {{“I don't see gender”: Conceptualizing a gendered system of academic publishing}}, + + url = {https://doi.org/10.1016/j.socscimed.2019.112388}, + + volume = {235}, + + year = {2019} + +} + +@article{wang2021gendered, + + author = {Wang, Xinyi and Dworkin, Jordan D. and Zhou, Dale and Stiso, Jennifer and Falk, Emily B and Bassett, Danielle S. and Zurn, Perry and Lydon-Staley, David M.}, + + year = {2021}, + + title = {Gendered citation practices in the field of communication}, + + journal = {Annals of the International Communication Association}, + + doi = {10.1080/23808985.2021.1960180}, +} + +@article{zurn2020network, + title={Network architectures supporting learnability}, + author={Zurn, Perry and Bassett, Danielle S}, + journal={Philosophical Transactions of the Royal Society B}, + volume={375}, + number={1796}, + pages={20190323}, + year={2020}, + publisher={The Royal Society} +} + +@book{moralia2005, + title={Moralia, Volume VI}, + author={Plutarch, Helmbold, William}, + year={1939}, + publisher={Harvard University Press} +} + +@book{bassett2022curious, +title={Curious Minds}, +author={Danielle S. Bassett and Perry Zurn}, +publisher={MIT Press}, +year={2022}, +} + +@book{fake2022, +title={fake}, +author={Danielle S. 
Bassett and Dale Zhou and Jennifer Stiso}, +publisher={MIT Press}, +year={2022}, +} + +@article{jurafsky2018n, + title={N-gram language models}, + author={Jurafsky, D and Martin, JH}, + journal={Speech and Language Processing: An Introduction to Natural Language Processing, Computational Linguistics, and Speech Recognition}, + year={2018} +} + +@article {Dworkin2020.01.03.894378, + author = {Dworkin, Jordan D. and Linn, Kristin A. and Teich, Erin G. and Zurn, Perry and Shinohara, Russell T. and Bassett, Danielle S.}, + title = {The extent and drivers of gender imbalance in neuroscience reference lists}, + elocation-id = {2020.01.03.894378}, + year = {2020}, + doi = {10.1101/2020.01.03.894378}, + publisher = {Cold Spring Harbor Laboratory}, + abstract = {Like many scientific disciplines, neuroscience has increasingly attempted to confront pervasive gender imbalances within the field. While much of the conversation has centered around publishing and conference participation, recent research in other fields has called attention to the prevalence of gender bias in citation practices. Because of the downstream effects that citations can have on visibility and career advancement, understanding and eliminating gender bias in citation practices is vital for addressing inequity in a scientific community. In this study, we sought to determine whether there is evidence of gender bias in the citation practices of neuroscientists. Utilizing data from five top neuroscience journals, we indeed find that reference lists tend to include more papers with men as first and last author than would be expected if gender was not a factor in referencing. Importantly, we show that this overcitation of men and undercitation of women is driven largely by the citation practices of men, and is increasing with time despite greater diversity in the academy. 
We develop a co-authorship network to determine the degree to which homophily in researchers{\textquoteright} social networks explains gendered citation practices and we find that men tend to overcite other men even when their social networks are representative of the field. We discuss possible mechanisms and consider how individual researchers might incorporate these findings into their own referencing practices.}, + URL = {https://www.biorxiv.org/content/early/2020/01/11/2020.01.03.894378}, + eprint = {https://www.biorxiv.org/content/early/2020/01/11/2020.01.03.894378.full.pdf}, + journal = {bioRxiv} +} + +@article{maliniak2013gender, + title={The gender citation gap in international relations}, + author={Maliniak, Daniel and Powers, Ryan and Walter, Barbara F}, + journal={International Organization}, + volume={67}, + number={4}, + pages={889--922}, + year={2013}, + publisher={Cambridge University Press} +} + +@article{caplar2017quantitative, + title={Quantitative evaluation of gender bias in astronomical publications from citation counts}, + author={Caplar, Neven and Tacchella, Sandro and Birrer, Simon}, + journal={Nature Astronomy}, + volume={1}, + number={6}, + pages={0141}, + year={2017}, + publisher={Nature Publishing Group} +} + +@article{mitchell2013gendered, + title={Gendered citation patterns in international relations journals}, + author={Mitchell, Sara McLaughlin and Lange, Samantha and Brus, Holly}, + journal={International Studies Perspectives}, + volume={14}, + number={4}, + pages={485--492}, + year={2013}, + publisher={Blackwell Publishing Ltd Oxford, UK} +} + +@article{dion2018gendered, + title={Gendered citation patterns across political science and social science methodology fields}, + author={Dion, Michelle L and Sumner, Jane Lawrence and Mitchell, Sara McLaughlin}, + journal={Political Analysis}, + volume={26}, + number={3}, + pages={312--327}, + year={2018}, + publisher={Cambridge University Press} +} + +@software{zhou_dale_2020_3672110, + 
author = {Zhou, Dale and + Cornblath, Eli J. and + Stiso, Jennifer and + Teich, Erin G. and + Dworkin, Jordan D. and + Blevins, Ann S. and + Bassett, Danielle S.}, + title = {Gender Diversity Statement and Code Notebook v1.0}, + month = feb, + year = 2020, + publisher = {Zenodo}, + version = {v1.0}, + doi = {10.5281/zenodo.3672110}, + url = {https://doi.org/10.5281/zenodo.3672110} +} + +@inproceedings{ambekar2009name, + title={Name-ethnicity classification from open sources}, + author={Ambekar, Anurag and Ward, Charles and Mohammed, Jahangir and Male, Swapna and Skiena, Steven}, + booktitle={Proceedings of the 15th ACM SIGKDD international conference on Knowledge Discovery and Data Mining}, + pages={49--58}, + year={2009} +} + +@article{sood2018predicting, + title={Predicting race and ethnicity from the sequence of characters in a name}, + author={Sood, Gaurav and Laohaprapanon, Suriyan}, + journal={arXiv preprint arXiv:1805.02109}, + year={2018} +} + +@article{bertolero2021racial, +title = {Racial and ethnic imbalance in neuroscience reference lists and intersections with gender}, +author = {Bertolero, Maxwell A. and Dworkin, Jordan D. and David, Sophia U. and Lloreda, Claudia López and Srivastava, Pragya and Stiso, Jennifer and Zhou, Dale and Dzirasa, Kafui and Fair, Damien A. and Kaczkurkin, Antonia N. and Marlin, Bianca Jones and Shohamy, Daphna and Uddin, Lucina Q. 
and Zurn, Perry and Bassett, Danielle S.}, +journal = {bioRxiv}, +year = {2020}, +xoi = {10.1101/2020.10.12.336230}, +} + +@article{chatterjee2021gender, +journal = {JAMA Netw Open}, +year = {2021}, +volume = {4}, +number = {7}, +pages = {e2114509}, +title = {Gender Disparity in Citations in High-Impact Journal Articles}, +author = {Chatterjee, Paula and Werner, Rachel M}, +} + +@article{fulvio2021imbalance, +title = {Gender (Im)balance in Citation Practices in Cognitive Neuroscience}, +author = {Fulvio, Jacqueline M and Akinnola, Ileri and Postle, Bradley R}, +journal = {J Cogn Neurosci}, +year = {2021}, +volume = {33}, +number = {1}, +pages = {3-7}, +} + +@article{ethnicolr2022black, + title={Test of ethnicolr}, + author={Washington, Denzel and Brown-Jackson, Ketanji}, + journal={Nature neuroscience}, + volume={16}, + number={2}, + pages={130}, + year={2013}, + publisher={Nature Publishing Group} +} + +@article{ethnicolr2022hispanic, + title={Test of ethnicolr}, + author={Cruz, Rafael and Ocasio-Cortez, Alexandria}, + journal={Nature neuroscience}, + volume={16}, + number={2}, + pages={130}, + year={2013}, + publisher={Nature Publishing Group} +} + +@article{ethnicolr2022asian, + title={Test of ethnicolr}, + author={Wang, Andrew and Yeoh, Michelle}, + journal={Nature neuroscience}, + volume={16}, + number={2}, + pages={130}, + year={2013}, + publisher={Nature Publishing Group} +} + +@article{ethnicolr2022white, + title={Test of ethnicolr}, + author={Coppola, Nicolas and Streep, Meryl}, + journal={Nature neuroscience}, + volume={16}, + number={2}, + pages={130}, + year={2013}, + publisher={Nature Publishing Group} +} \ No newline at end of file From f1988b9fbb5d95baf3cc8a58f48dd34f03ced3e3 Mon Sep 17 00:00:00 2001 From: Jennifer Stiso Date: Wed, 11 May 2022 13:16:34 -0400 Subject: [PATCH 09/47] added function for finding unused citations in aux file --- __init__.py | 0 cleanBib.ipynb | 408 +------- tests/aux/checkcites.lua | 869 ++++++++++++++++++ 
tests/{immaculate => aux}/cleanedBib.csv | 0 tests/{ => aux}/document.aux | 0 tests/{ => aux}/pipeline.py | 20 +- tests/{ => aux}/testBib_immaculate.bib | 0 tests/cleanedBib.csv | 1 + tests/erroneous/pipeline.py | 37 + tests/{ => erroneous}/testBib_erroneous.bib | 0 tests/immaculate/cleanedBib_test.csv | 15 + .../__pycache__/preprocessing.cpython-39.pyc | Bin 0 -> 13102 bytes utils/__pycache__/queries.cpython-39.pyc | Bin 0 -> 838 bytes utils/preprocessing.py | 150 ++- 14 files changed, 1004 insertions(+), 496 deletions(-) create mode 100644 __init__.py create mode 100755 tests/aux/checkcites.lua rename tests/{immaculate => aux}/cleanedBib.csv (100%) rename tests/{ => aux}/document.aux (100%) rename tests/{ => aux}/pipeline.py (57%) rename tests/{ => aux}/testBib_immaculate.bib (100%) create mode 100644 tests/cleanedBib.csv create mode 100644 tests/erroneous/pipeline.py rename tests/{ => erroneous}/testBib_erroneous.bib (100%) create mode 100644 tests/immaculate/cleanedBib_test.csv create mode 100644 utils/__pycache__/preprocessing.cpython-39.pyc create mode 100644 utils/__pycache__/queries.cpython-39.pyc diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cleanBib.ipynb b/cleanBib.ipynb index 2d7e005..a5f6cff 100644 --- a/cleanBib.ipynb +++ b/cleanBib.ipynb @@ -75,115 +75,6 @@ "from pybtex.database import parse_file\n", "import seaborn as sns\n", "\n", - "\n", - "def checkcites_output(aux_file):\n", - " '''take in aux file for tex document, return list of citation keys\n", - " that are in .bib file but not in document'''\n", - "\n", - " result = subprocess.run(['texlua', 'checkcites.lua', aux_file[0]], stdout=subprocess.PIPE)\n", - " result = result.stdout.decode('utf-8')\n", - " unused_array_raw = result.split('\\n')\n", - " # process array of unused references + other output \n", - " unused_array_final = list()\n", - " for x in unused_array_raw:\n", - " if len(x) > 0: # if line is not empty\n", - " if x[0] == '-': # 
and if first character is a '-', it's a citation key\n", - " unused_array_final.append(x[2:]) # truncate '- ' \n", - " if \"------------------------------------------------------------------------\" in unused_array_final:\n", - " return(result)\n", - " else:\n", - " return(unused_array_final)\n", - "\n", - "\n", - "def removeMiddleName(line):\n", - " arr = line.split()\n", - " last = arr.pop()\n", - " n = len(arr)\n", - " if n == 4:\n", - " first, middle = ' '.join(arr[:2]), ' '.join(arr[2:])\n", - " elif n == 3:\n", - " first, middle = arr[0], ' '.join(arr[1:])\n", - " elif n == 2:\n", - " first, middle = arr\n", - " elif n==1:\n", - " return line\n", - " return(str(first + ' ' + middle))\n", - "\n", - "\n", - "def returnFirstName(line):\n", - " arr = line.split()\n", - " n = len(arr)\n", - " if n == 4:\n", - " first, middle = ' '.join(arr[:2]), ' '.join(arr[2:])\n", - " elif n == 3:\n", - " first, middle = arr[0], ' '.join(arr[1:])\n", - " elif n == 2:\n", - " first, middle = arr\n", - " elif n==1:\n", - " return line\n", - " return(str(middle))\n", - "\n", - "\n", - "def convertLatexSpecialChars(latex_text):\n", - " return LatexNodes2Text().latex_to_text(latex_text)\n", - "\n", - "\n", - "def convertSpecialCharsToUTF8(text):\n", - " data = LatexNodes2Text().latex_to_text(text)\n", - " return unicodedata.normalize('NFD', data).encode('ascii', 'ignore').decode('utf-8')\n", - "\n", - "\n", - "def namesFromXref(doi, title, authorPos):\n", - " '''Use DOI and article titles to query Crossref for author list'''\n", - " if authorPos == 'first':\n", - " idx = 0\n", - " elif authorPos == 'last':\n", - " idx = -1\n", - " # get cross ref data\n", - " authors = ['']\n", - " # first try DOI\n", - " if doi != \"\":\n", - " works = cr.works(query = title, select = [\"DOI\",\"author\"], limit=1, filter = {'doi': doi})\n", - " if works['message']['total-results'] > 0:\n", - " authors = works['message']['items'][0]['author']\n", - " elif title != '': \n", - " works = 
cr.works(query = f'title:\"{title}\"', select = [\"title\",\"author\"], limit=10)\n", - " cnt = 0\n", - " name = ''\n", - " # check that you grabbed the proper paper\n", - " if works['message']['items'][cnt]['title'][0].lower() == title.lower():\n", - " authors = works['message']['items'][0]['author']\n", - "\n", - " # check the all fields are available\n", - " if not 'given' in authors[idx]:\n", - " name = ''\n", - " else:\n", - " # trim initials\n", - " name = authors[idx]['given'].replace('.',' ').split()[0]\n", - "\n", - " return name\n", - "\n", - "\n", - "def namesFromXrefSelfCite(doi, title):\n", - " selfCiteCheck = 0\n", - " # get cross ref data\n", - " authors = ['']\n", - " # first try DOI\n", - " if doi != \"\":\n", - " works = cr.works(query = title, select = [\"DOI\",\"author\"], limit=1, filter = {'doi': doi})\n", - " if works['message']['total-results'] > 0:\n", - " authors = works['message']['items'][0]['author']\n", - " \n", - " for i in authors:\n", - " if i != \"\":\n", - " first = i['given'].replace('.',' ').split()[0]\n", - " last = i['family'].replace('.',' ').split()[0]\n", - " authors = removeMiddleName(last + \", \" + first)\n", - " if authors in removeMiddleName(yourFirstAuthor) or authors in removeMiddleName(convertSpecialCharsToUTF8(yourFirstAuthor)) or authors in removeMiddleName(yourLastAuthor) or authors in removeMiddleName(convertSpecialCharsToUTF8(yourLastAuthor)):\n", - " selfCiteCheck += 1\n", - " return selfCiteCheck\n", - "\n", - "\n", "cr = Crossref()\n", "homedir = '/home/jovyan/'\n", "bib_files = glob.glob(homedir + '*.bib')\n", @@ -243,296 +134,17 @@ "optionalEqualContributors = ['LastName, FirstName OptionalMiddleInitial', 'LastName, FirstName OptionalMiddleInitial']\n", "checkingPublishedArticle = False\n", "\n", - "if (yourFirstAuthor == 'LastName, FirstName OptionalMiddleInitial') or (yourLastAuthor == 'LastName, FirstName OptionalMiddleInitial'):\n", - " raise ValueError(\"Please enter your manuscript's first and last 
author names\")\n", - "\n", "if paper_aux_file:\n", - " if optionalEqualContributors == ['LastName, FirstName OptionalMiddleInitial', 'LastName, FirstName OptionalMiddleInitial']:\n", - " citing_authors = np.array([yourFirstAuthor, yourLastAuthor])\n", - " else:\n", - " citing_authors = np.array([yourFirstAuthor, yourLastAuthor, optionalEqualContributors])\n", - " print(checkcites_output(paper_aux_file))\n", - " unused_in_paper = checkcites_output(paper_aux_file) # get citations in library not used in paper\n", - " print(\"Unused citations: \", unused_in_paper.count('=>'))\n", - " \n", - " \n", - " parser = BibTexParser()\n", - " parser.ignore_nonstandard_types = False\n", - " parser.common_strings = True\n", - " \n", - " bib_data = None\n", - " for bib_file in bib_files:\n", - " with open(bib_file) as bibtex_file:\n", - " if bib_data is None:\n", - " bib_data = bibtexparser.bparser.BibTexParser(common_strings=True, ignore_nonstandard_types=False).parse_file(bibtex_file)\n", - " else:\n", - " bib_data_extra = bibtexparser.bparser.BibTexParser(common_strings=True, ignore_nonstandard_types=False).parse_file(bibtex_file)\n", - " bib_data.entries_dict.update(bib_data_extra.entries_dict)\n", - " bib_data.entries.extend(bib_data_extra.entries)\n", - " \n", - " all_library_citations = list(bib_data.entries_dict.keys())\n", - " print(\"All citations: \", len(all_library_citations))\n", - " \n", - " for k in all_library_citations:\n", - " if re.search('\\\\b'+ k + '\\\\b', unused_in_paper.replace('\\n',' ').replace('=>',' ')) != None:\n", - " del bib_data.entries_dict[k] # remove from entries dictionary if not in paper\n", - " \n", - " in_paper_mask = [re.search('\\\\b'+ bib_data.entries[x]['ID'] + '\\\\b', unused_in_paper.replace('\\n',' ').replace('=>',' ')) == None for x in range(len(bib_data.entries))]\n", - " bib_data.entries = [bib_data.entries[x] for x in np.where(in_paper_mask)[0]] # replace entries list with entries only in paper\n", - " del bib_data.comments\n", - 
" \n", - " duplicates = []\n", - " for key in bib_data.entries_dict.keys():\n", - " count = str(bib_data.entries).count(\"'ID\\': \\'\"+ key + \"\\'\")\n", - " if count > 1:\n", - " duplicates.append(key)\n", - " \n", - " if len(duplicates) > 0:\n", - " raise ValueError(\"In your .bib file, please remove duplicate entries or duplicate entry ID keys for:\", ' '.join(map(str, duplicates)))\n", - "\n", - " if os.path.exists(paper_bib_file):\n", - " os.remove(paper_bib_file)\n", - " \n", - " with open(paper_bib_file, 'w') as bibtex_file:\n", - " bibtexparser.dump(bib_data, bibtex_file)\n", - " \n", - " # define first author and last author names of citing paper -- will exclude citations of these authors\n", - " # beware of latex symbols within author names\n", - " # in_paper_citations = list(bib_data.entries_dict.keys())\n", - " in_paper_citations = [bib_data.entries[x]['ID'] for x in range(len(bib_data.entries))] # get list of citation keys in paper\n", - " \n", - " # extract author list for every cited paper\n", - " cited_authors = [bib_data.entries_dict[x]['author'] for x in in_paper_citations]\n", - " # find citing authors in cited author list\n", - " # using nested list comprehension, make a citing author -by- citation array of inclusion\n", - " self_cite_mask = np.array([[str(citing_author) in authors for authors in cited_authors] for citing_author in citing_authors])\n", - " self_cite_mask = np.any(self_cite_mask,axis=0) # collapse across citing authors such that any coauthorship by either citing author -> exclusion\n", - " \n", - " print(\"Self-citations: \", [bib_data.entries[x]['ID'] for x in np.where(self_cite_mask)[0]]) # print self citations\n", - " for idx,k in enumerate(in_paper_citations):\n", - " if self_cite_mask[idx]:\n", - " del bib_data.entries_dict[k] # delete citation from dictionary if self citationi\n", - " bib_data.entries = [bib_data.entries[x] for x in np.where(np.invert(self_cite_mask))[0]] # replace entries list with entries that aren't 
self citations\n", - " \n", - " paper_bib_file_excl_sc = os.path.splitext(paper_bib_file)[0] + '_noselfcite.bib'\n", - " \n", - " if os.path.exists(paper_bib_file_excl_sc):\n", - " os.remove(paper_bib_file_excl_sc)\n", - " \n", - " with open(paper_bib_file_excl_sc, 'w') as bibtex_file:\n", - " bibtexparser.dump(bib_data, bibtex_file)\n", - " \n", - " ID = glob.glob(homedir + paper_bib_file_excl_sc)\n", - "else:\n", - " ID = glob.glob(homedir + '*bib')\n", - " with open(ID[0]) as bibtex_file:\n", - " bib_data = bibtexparser.bparser.BibTexParser(common_strings=True, ignore_nonstandard_types=False).parse_file(bibtex_file)\n", - " duplicates = []\n", - " for key in bib_data.entries_dict.keys():\n", - " count = str(bib_data.entries).count(\"'ID\\': \\'\"+ key + \"\\'\")\n", - " if count > 1:\n", - " duplicates.append(key)\n", - " \n", - " if len(duplicates) > 0:\n", - " raise ValueError(\"In your .bib file, please remove duplicate entries or duplicate entry ID keys for:\", ' '.join(map(str, duplicates)))\n", - "\n", - "if checkingPublishedArticle == True:\n", - " FA = []\n", - " LA = []\n", - " counter = 1\n", - " selfCiteCount = 0\n", - " titleCount = 1 # \n", - " counterNoDOI = list() # row index (titleCount) of entries with no DOI\n", - " outPath = homedir + 'cleanedBib.csv'\n", - "\n", - " if os.path.exists(outPath):\n", - " os.remove(outPath)\n", - "\n", - " with open(outPath, 'w', newline='') as csvfile:\n", - " writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)\n", - " writer.writerow(['Article', 'FA', 'LA', 'Title', 'SelfCite', 'CitationKey'])\n", - " \n", - " citedArticleDOI = list()\n", - " citedArticleNoDOI = list()\n", - " allArticles = list()\n", - " for entry in bib_data.entries:\n", - " my_string= entry['cited-references'].split('\\n')\n", - " for citedArticle in my_string:\n", - " allArticles.append(citedArticle)\n", - " if citedArticle.partition(\"DOI \")[-1]=='':\n", - " citedArticleNoDOI.append(citedArticle)\n", - " 
counterNoDOI.append(titleCount)\n", - " else:\n", - " line = citedArticle.partition(\"DOI \")[-1].replace(\"DOI \",\"\").rstrip(\".\")\n", - " line = ''.join( c for c in line if c not in '{[}] ')\n", - " if \",\" in line:\n", - " line = line.partition(\",\")[-1]\n", - " citedArticleDOI.append(line)\n", - " with open('citedArticlesDOI.csv', 'a', newline='') as csvfile:\n", - " writer = csv.writer(csvfile, delimiter=',')\n", - " writer.writerow([line])\n", - " titleCount += 1\n", - "\n", - " articleNum = 0\n", - " for doi in citedArticleDOI:\n", - " try:\n", - " FA = namesFromXref(doi, '', 'first')\n", - " except UnboundLocalError:\n", - " sleep(1)\n", - " continue\n", - "\n", - " try:\n", - " LA = namesFromXref(doi, '', 'last')\n", - " except UnboundLocalError:\n", - " sleep(1)\n", - " continue\n", - "\n", - " try:\n", - " selfCiteCount = namesFromXrefSelfCite(doi, '')\n", - " except UnboundLocalError:\n", - " sleep(1)\n", - " continue\n", - "\n", - " with open(outPath, 'a', newline='') as csvfile: \n", - " if selfCiteCount == 0:\n", - " writer = csv.writer(csvfile, delimiter=',')\n", - " getArticleIndex = [i for i, s in enumerate(allArticles) if doi in s]\n", - " writer.writerow([counter, convertSpecialCharsToUTF8(FA), convertSpecialCharsToUTF8(LA), allArticles[[i for i, s in enumerate(allArticles) if doi in s][0]], '', ''])\n", - " print(str(counter) + \": \" + doi )\n", - " counter += 1\n", - " else:\n", - " print(str(articleNum) + \": \" + doi + \"\\t\\t\\t <-- self-citation\" )\n", - " articleNum += 1\n", - "\n", - " if len(citedArticleNoDOI)>0:\n", - " print()\n", - " for elem in citedArticleNoDOI:\n", - " with open(outPath, 'a', newline='') as csvfile: \n", - " writer = csv.writer(csvfile, delimiter=',')\n", - " writer.writerow([counter, '', '', elem, '', ''])\n", - " print(str(counter) + \": \" + elem )\n", - " counter += 1\n", - " print()\n", - " raise ValueError(\"WARNING: No article DOI was provided for the last \" + str(len(citedArticleNoDOI)) + \" 
listed papers. Please manually search for these articles. IF AND ONLY IF your citing paper's first and last author are not co-authors in the paper that was cited, enter the first name of the first and last authors of the paper that was cited manually. Then, continue to the next code block.\")\n", + " find_unused_cites(paper_aux_file)\n", + "\n", + "bib_data = get_bib_data(homedir)\n", + "if checkingPublishedArticle:\n", + " get_names_published(homedir, bib_data, cr)\n", "else:\n", - " FA = []\n", - " LA = []\n", - " parser = bibtex.Parser()\n", - " bib_data = parser.parse_file(ID[0])\n", - " counter = 1\n", - " nameCount = 0\n", - " outPath = homedir + 'cleanedBib.csv'\n", - "\n", - " if os.path.exists(outPath):\n", - " os.remove(outPath)\n", - "\n", - " with open(outPath, 'w', newline='') as csvfile:\n", - " writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)\n", - " writer.writerow(['Article', 'FA', 'LA', 'Title', 'SelfCite', 'CitationKey'])\n", - "\n", - " for key in bib_data.entries.keys():\n", - " diversity_bib_titles = ['The extent and drivers of gender imbalance in neuroscience reference lists',\n", - " 'The gender citation gap in international relations',\n", - " 'Quantitative evaluation of gender bias in astronomical publications from citation counts',\n", - " '\\# CommunicationSoWhite',\n", - " '{Just Ideas? 
The Status and Future of Publication Ethics in Philosophy: A White Paper}',\n", - " 'Gendered citation patterns across political science and social science methodology fields',\n", - " 'Gender Diversity Statement and Code Notebook v1.0', \n", - " 'Racial and ethnic imbalance in neuroscience reference lists and intersections with gender', \n", - " 'Gender Diversity Statement and Code Notebook v1.1',\n", - " 'Gendered citation practices in the field of communication',\n", - " 'Gender disparity in citations in high- impact journal articles',\n", - " 'Gender (im)balance in citation practices in cognitive neuroscience',\n", - " 'Name-ethnicity classification from open sources',\n", - " 'Predicting race and ethnicity from the sequence of characters in a name']\n", - " if bib_data.entries[key].fields['title'] in diversity_bib_titles:\n", - " continue\n", - "\n", - " try:\n", - " author = bib_data.entries[key].persons['author']\n", - " except:\n", - " author = bib_data.entries[key].persons['editor']\n", - " FA = author[0].rich_first_names\n", - " LA = author[-1].rich_first_names\n", - " FA = convertLatexSpecialChars(str(FA)[7:-3]).translate(str.maketrans('', '', string.punctuation)).replace('Protected',\"\").replace(\" \",'')\n", - " LA = convertLatexSpecialChars(str(LA)[7:-3]).translate(str.maketrans('', '', string.punctuation)).replace('Protected',\"\").replace(\" \",'')\n", - " \n", - " # check if we grabbed a first initial when a full middle name was available\n", - " if (len(FA) == 1):\n", - " mn = author[0].rich_middle_names\n", - " mn = convertLatexSpecialChars(str(mn)[7:-3]).translate(str.maketrans('', '', string.punctuation)).replace('Protected',\"\").replace(\" \",'')\n", - " if len(mn) > 1:\n", - " FA = mn\n", - " if (len(LA) == 1):\n", - " mn = author[-1].rich_middle_names\n", - " mn = convertLatexSpecialChars(str(mn)[7:-3]).translate(str.maketrans('', '', string.punctuation)).replace('Protected',\"\").replace(\" \",'')\n", - " if len(mn) > 1:\n", - " LA = 
mn\n", - " \n", - " # check that we got a name (not an initial) from the bib file, if not try using the title in the crossref API\n", - " try:\n", - " title = bib_data.entries[key].fields['title'].replace(',', '').replace(',', '').replace('{','').replace('}','')\n", - " except:\n", - " title = ''\n", - " try:\n", - " doi = bib_data.entries[key].fields['doi']\n", - " except:\n", - " doi = ''\n", - " if FA == '' or len(FA.split('.')[0]) <= 1:\n", - " while True:\n", - " try:\n", - " FA = namesFromXref(doi, title, 'first')\n", - " except UnboundLocalError:\n", - " sleep(1)\n", - " continue\n", - " break\n", - " if LA == '' or len(LA.split('.')[0]) <= 1:\n", - " while True:\n", - " try:\n", - " LA = namesFromXref(doi, title, 'last')\n", - " except UnboundLocalError:\n", - " sleep(1)\n", - " continue\n", - " break\n", - "\n", - " if (yourFirstAuthor!='LastName, FirstName OptionalMiddleInitial') and (yourLastAuthor!='LastName, FirstName OptionalMiddleInitial'):\n", - " selfCiteCheck1 = [s for s in author if removeMiddleName(yourLastAuthor) in str([convertLatexSpecialChars(str(s.rich_last_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\"), convertLatexSpecialChars(str(s.rich_first_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\")]).replace(\"'\", \"\")]\n", - " selfCiteCheck1a = [s for s in author if removeMiddleName(yourLastAuthor) in str([convertSpecialCharsToUTF8(str(s.rich_last_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\"), convertSpecialCharsToUTF8(str(s.rich_first_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\")]).replace(\"'\", \"\")]\n", - " selfCiteCheck1b = [s for s in author if removeMiddleName(yourLastAuthor) in str([convertSpecialCharsToUTF8(str(s.rich_last_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\"), LA]).replace(\"'\", \"\")]\n", - "\n", - " selfCiteCheck2 = [s for s in author if removeMiddleName(yourFirstAuthor) in 
str([convertLatexSpecialChars(str(s.rich_last_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\"), convertLatexSpecialChars(str(s.rich_first_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\")]).replace(\"'\", \"\")]\n", - " selfCiteCheck2a = [s for s in author if removeMiddleName(yourFirstAuthor) in str([convertSpecialCharsToUTF8(str(s.rich_last_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\"), convertSpecialCharsToUTF8(str(s.rich_first_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\")]).replace(\"'\", \"\")]\n", - " selfCiteCheck2b = [s for s in author if removeMiddleName(yourFirstAuthor) in str([convertSpecialCharsToUTF8(str(s.rich_last_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\"), FA]).replace(\"'\", \"\")]\n", - "\n", - " nameCount = 0\n", - " if optionalEqualContributors != ('LastName, FirstName OptionalMiddleInitial', 'LastName, FirstName OptionalMiddleInitial'):\n", - " for name in optionalEqualContributors:\n", - " selfCiteCheck3 = [s for s in author if removeMiddleName(name) in str([convertLatexSpecialChars(str(s.rich_last_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\"), convertLatexSpecialChars(str(s.rich_first_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\")]).replace(\"'\", \"\")]\n", - " selfCiteCheck3a = [s for s in author if removeMiddleName(name) in str([convertSpecialCharsToUTF8(str(s.rich_last_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\"), convertSpecialCharsToUTF8(str(s.rich_first_names)[7:-3]).replace(\"', Protected('\",\"\").replace(\"'), '\", \"\")]).replace(\"'\", \"\")]\n", - " if len(selfCiteCheck3)>0:\n", - " nameCount += 1\n", - " if len(selfCiteCheck3a)>0:\n", - " nameCount += 1\n", - " selfCiteChecks = [selfCiteCheck1, selfCiteCheck1a, selfCiteCheck1b, selfCiteCheck2, selfCiteCheck2a, selfCiteCheck2b]\n", - " if sum([len(check) for check in 
selfCiteChecks]) + nameCount > 0:\n", - " selfCite = 'Y'\n", - " if len(FA) < 2:\n", - " print(str(counter) + \": \" + key + \"\\t\\t <-- self-citation <-- ***NAME MISSING OR POSSIBLY INCOMPLETE***\")\n", - " else:\n", - " print(str(counter) + \": \" + key + \" <-- self-citation\")\n", - " else:\n", - " selfCite= 'N'\n", - " if len(FA) < 2:\n", - " print(str(counter) + \": \" + key + \"\\t\\t <-- ***NAME MISSING OR POSSIBLY INCOMPLETE***\")\n", - " else:\n", - " print(str(counter) + \": \" + key)\n", - " else:\n", - " selfCite = 'NA'\n", - "\n", - " with open(outPath, 'a', newline='') as csvfile:\n", - " writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)\n", - " writer.writerow([counter, convertSpecialCharsToUTF8(FA), convertSpecialCharsToUTF8(LA), title, selfCite, key])\n", - " counter += 1" + " # find and print duplicates\n", + " get_duplicates(bib_data)\n", + " # get names, remove CDS, find self cites\n", + " get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)" ] }, { @@ -1111,4 +723,4 @@ }, "nbformat": 4, "nbformat_minor": 1 -} +} \ No newline at end of file diff --git a/tests/aux/checkcites.lua b/tests/aux/checkcites.lua new file mode 100755 index 0000000..6191ca2 --- /dev/null +++ b/tests/aux/checkcites.lua @@ -0,0 +1,869 @@ +#!/usr/bin/env texlua +-- ----------------------------------------------------------------- +-- checkcites.lua +-- Copyright 2012, 2019, Enrico Gregorio, Paulo Roberto Massa Cereda +-- +-- This work may be distributed and/or modified under the conditions +-- of the LaTeX Project Public License, either version 1.3 of this +-- license or (at your option) any later version. +-- +-- The latest version of this license is in +-- +-- http://www.latex-project.org/lppl.txt +-- +-- and version 1.3 or later is part of all distributions of LaTeX +-- version 2005/12/01 or later. +-- +-- This work has the LPPL maintenance status `maintained'. 
the +-- current maintainers of this work are the original authors. This +-- work consists of the file checkcites.lua. +-- +-- Project repository: http://github.com/cereda/checkcites +-- ----------------------------------------------------------------- + +-- Checks if the table contains the element. +-- @param a Table. +-- @param hit Element. +-- @return Boolean value if the table contains the element. +local function exists(a, hit) + for _, v in ipairs(a) do + if v == hit then + return true + end + end + return false +end + +-- Parses the list of arguments based on a configuration map. +-- @param map Configuration map. +-- @param args List of command line arguments. +-- @return Table containing the valid keys and entries. +-- @return Table containing the invalid keys. +local function parse(map, args) + local keys, key, unknown = {}, 'unpaired', {} + local a, b + for _, v in ipairs(args) do + a, _, b = string.find(v, '^%-(%w)$') + if a then + for _, x in ipairs(map) do + key = 'unpaired' + if x['short'] == b then + key = x['long'] + break + end + end + if key == 'unpaired' then + table.insert(unknown, '-' .. b) + end + if not keys[key] then + keys[key] = {} + end + else + a, _, b = string.find(v, '^%-%-([%w-]+)$') + if a then + for _, x in ipairs(map) do + key = 'unpaired' + if x['long'] == b then + key = b + break + end + end + if key == 'unpaired' then + if not exists(unknown, '--' .. b) then + table.insert(unknown, '--' .. 
b) + end + end + if not keys[key] then + keys[key] = {} + end + else + if not keys[key] then + keys[key] = {} + end + if key ~= 'unpaired' then + for _, x in ipairs(map) do + if x['long'] == key then + if not (x['argument'] and + #keys[key] == 0) then + key = 'unpaired' + end + break + end + end + if not keys[key] then + keys[key] = {} + end + table.insert(keys[key], v) + else + if not keys[key] then + keys[key] = {} + end + table.insert(keys[key], v) + end + end + end + end + return keys, unknown +end + +-- Calculates the difference between two tables. +-- @param a First table. +-- @param b Second table. +-- @return Table containing the difference between two tables. +local function difference(a, b) + local result = {} + for _, v in ipairs(a) do + if not exists(b, v) then + table.insert(result, v) + end + end + return result +end + +-- Splits the string based on a pattern. +-- @param str String. +-- @param pattern Pattern. +local function split(str, pattern) + local result = {} + string.gsub(str, pattern, function(a) + table.insert(result, a) end) + return result +end + +-- Reads lines from a file. +-- @param file File. +-- @returns Table representing the lines. +local function read(file) + local handler = io.open(file, 'r') + local lines = {} + if handler then + for line in handler:lines() do + table.insert(lines, line) + end + handler:close() + end + return lines +end + +-- Gets a pluralized word based on a counter. +-- @param i Counter. +-- @param a Word in singular. +-- @param b Word in plural. +-- @return Either the first or second word based on the counter. +local function plural(i, a, b) + if i == 1 then + return a + else + return b + end +end + +-- Normalizes the string, removing leading and trailing spaces. +-- @param str String. +-- @return Normalized string without leading and trailing spaces. 
+local function normalize(str) + local result, _ = string.gsub(str, '^%s', '') + result, _ = string.gsub(result, '%s$', '') + return result +end + +-- Checks if the element is in a blacklist. +-- @param a Element. +-- @return Boolean value if the element is blacklisted. +local function blacklist(a) + local list = {} + for _, v in ipairs(list) do + if v == a then + return true + end + end + return false +end + +-- Checks if the key is allowed. +-- @param v The key itself. +-- @return Boolean value if the key is allowed. +local function allowed(key) + local keys = { 'string', 'comment' } + for _, v in ipairs(keys) do + if string.lower(key) == v then + return false + end + end + return true +end + +-- Extracts the biblographic key. +-- @param lines Lines of a file. +-- @return Table containing bibliographic keys. +local function extract(lines) + local result = {} + for _, line in ipairs(lines) do + local key, hit = string.match(line, + '^%s*%@(%w+%s*){%s*(.+),') + if key and allowed(key) then + if not exists(result, hit) then + hit = normalize(hit) + table.insert(result, hit) + end + end + end + return result +end + +-- Extracts the cross-references found +-- in lines of the bibligraphy file. +-- @param lines Line of a file. +-- @return Table containing cross-references. +local function crossref(lines) + local result, lookup, key, hit = {}, '' + for _, line in ipairs(lines) do + key, hit = string.match(line, + '^%s*%@(%w+%s*){%s*(.+),') + if key and allowed(key) then + lookup = normalize(hit) + else + key, hit = string.match(line, + '^%s*(%w+)%s*=%s*(.+)$') + if key then + key = string.lower(key) + if key == 'crossref' then + if string.sub(hit, -1) == ',' then + hit = string.sub(hit, 2, -3) + else + hit = string.sub(hit, 2, -2) + end + result[lookup] = hit + end + end + end + end + return result +end + +-- Adds the extension if the file does not have it. +-- @param file File. +-- @param extension Extension. +-- @return File with proper extension. 
+local function sanitize(file, extension) + extension = '.' .. extension + if string.sub(file, -#extension) ~= extension then + file = file .. extension + end + return file +end + +-- Checks if a file exists. +-- @param file File. +-- @return Boolean value indicating if the file exists. +local function valid(file) + local handler = io.open(file, 'r') + if handler then + handler:close() + return true + else + return false + end +end + +-- Wraps a string based on a line width. +-- @param str String. +-- @param size Line width. +-- @return Wrapped string. +local function wrap(str, size) + local parts = split(str, '[^%s]+') + local r, l = '', '' + for _, v in ipairs(parts) do + if (#l + #v) > size then + r = r .. '\n' .. l + l = v + else + l = normalize(l .. ' ' .. v) + end + end + r = normalize(r .. '\n' .. l) + return r +end + +-- Backend namespace +local backends = {} + +-- Gets data from auxiliary files (BibTeX). +-- @param lines Lines of a file. +-- @param rec Recursive switch. +-- @return Boolean indicating if an asterisk was found. +-- @return Table containing the citations. +-- @return Table containing the bibliography files. 
backends.bibtex = function(lines, rec)
  local found_asterisk = false
  local cited, sources, unresolved = {}, {}, {}

  -- Appends the value to the list unless it is already there.
  local function add_unique(list, value)
    if not exists(list, value) then
      table.insert(list, value)
    end
  end

  for _, entry in ipairs(lines) do
    -- A line is tried as \citation first, then \bibdata, then \@input;
    -- later patterns are only tried when the earlier ones did not match.
    local keys = string.match(entry, '^%s*\\citation{(.+)}$')
    local data = not keys and
                 string.match(entry, '^%s*\\bibdata{(.+)}$')
    local input = not keys and not data and
                  string.match(entry, '^%s*\\@input{(.+)}$')

    if keys == '*' then
      found_asterisk = true
    elseif keys then
      for _, name in ipairs(split(keys, '[^,%s]+')) do
        add_unique(cited, normalize(name))
      end
    elseif data then
      for _, name in ipairs(split(data, '[^,%s]+')) do
        name = normalize(name)
        if not exists(sources, name) and not blacklist(name) then
          table.insert(sources, name)
        end
      end
    elseif input and rec then
      local aux = sanitize(input, 'aux')
      if valid(aux) then
        -- Included files are read with the recursive switch off, so
        -- only one level of \@input is followed.
        local nested_asterisk, nested_cited, nested_sources =
          backends.bibtex(read(aux), false)
        found_asterisk = found_asterisk or nested_asterisk
        for _, name in ipairs(nested_cited) do
          add_unique(cited, name)
        end
        for _, name in ipairs(nested_sources) do
          add_unique(sources, name)
        end
      else
        table.insert(unresolved, aux)
      end
    end
  end

  -- Report the included auxiliary files that could not be opened.
  local missing = #unresolved
  if missing > 0 then
    local message = 'Warning: there ' ..
      plural(missing, 'is an invalid reference ',
             'are invalid references ') ..
      'to the following auxiliary ' ..
      plural(missing, 'file ', 'files ') ..
      'that could not be resolved at runtime:'
    print()
    print(wrap(message, 74))
    for _, name in ipairs(unresolved) do
      print('=> ' .. name)
    end
  end

  return found_asterisk, cited, sources
end

-- Gets data from auxiliary files (Biber).
-- @param lines Lines of a file.
-- @param _ To be discarded with biber.
-- @return Boolean indicating if an asterisk was found.
-- @return Table containing the citations.
-- @return Table containing the bibliography files.
backends.biber = function(lines, _)
  local citations, bibliography = {}, {}
  local asterisk, parts, hit = false
  for _, line in ipairs(lines) do
    -- NOTE(review): the patterns in the text this was recovered from
    -- had lost everything between '<' and '>' and matched any non-empty
    -- line. The element names below are a reconstruction of the biber
    -- control file (.bcf) markup -- confirm against a real .bcf file.
    -- '[^>]*' tolerates whatever attributes the element carries.
    hit = string.match(line, '^%s*<bcf:citekey[^>]*>' ..
          '(.+)</bcf:citekey>%s*$')
    if hit then
      if hit ~= '*' then
        parts = split(hit, '[^,%s]+')
        for _, v in ipairs(parts) do
          v = normalize(v)
          if not exists(citations, v) then
            table.insert(citations, v)
          end
        end
      else
        asterisk = true
      end
    else
      hit = string.match(line, '^%s*<bcf:datasource type="file"' ..
            '[^>]*>(.+)</bcf:datasource>%s*$')
      if hit then
        parts = split(hit, '[^,%s]+')
        for _, v in ipairs(parts) do
          v = normalize(v)
          if not exists(bibliography, v) and
             not blacklist(v) then
            table.insert(bibliography, v)
          end
        end
      end
    end
  end
  return asterisk, citations, bibliography
end

-- Counts the number of elements of a nominal (key/value) table.
-- @param t Table.
-- @return Table size.
local function count(t)
  local counter = 0
  for _ in pairs(t) do
    counter = counter + 1
  end
  return counter
end

-- Repeats the provided char until the string is at least as long as
-- the requested size. The char is always present at least once.
-- @param c Char.
-- @param size Number of times.
-- @return String with a char repeated a certain number of times.
local function pad(c, size)
  -- An empty string never grows; return it instead of looping forever.
  if #c == 0 then
    return c
  end
  local r = c
  while #r < size do
    r = r .. c
  end
  return r
end

-- Flattens a table of tables into only one table, skipping values
-- that were already added.
-- @param t Table.
-- @return Flattened table.
local function flatten(t)
  local result = {}
  for _, v in ipairs(t) do
    for _, k in ipairs(v) do
      if not exists(result, k) then
        table.insert(result, k)
      end
    end
  end
  return result
end

-- Organizes a key/value table of tables into only one table. When a
-- key shows up more than once, the first value found is kept.
-- @param t Table.
-- @return Flattened key/value table.
local function organize(t)
  local result = {}
  for _, v in ipairs(t) do
    for j, k in pairs(v) do
      if not result[j] then
        result[j] = k
      end
    end
  end
  return result
end

-- Applies a function to elements of a table.
-- @param c Table.
-- @param f Function.
-- @return A new table.
local function apply(c, f)
  local result = {}
  for _, v in ipairs(c) do
    -- Keep only the first value returned by f; extra return values
    -- would otherwise be passed on to table.insert.
    local value = f(v)
    table.insert(result, value)
  end
  return result
end

-- Search the TeX tree for the file.
-- @param library The library reference.
-- @param file The filename.
-- @param extension The extension.
-- @return String pointing to the file location.
local function lookup(library, file, extension)
  return library.find_file(file, extension)
end

-- Prints the script header.
local function header()
  -- NOTE(review): the runs of blanks in the banner were collapsed in
  -- the text this was recovered from; the spacing below was rebuilt so
  -- that the four rows line up -- confirm against the upstream script.
  print("     _           _       _ _")
  print(" ___| |_ ___ ___| |_ ___|_| |_ ___ ___")
  print("|  _|   | -_|  _| '_|  _| |  _| -_|_ -|")
  print("|___|_|_|___|___|_,_|___|_|_| |___|___|")
  print()
  print(wrap('checkcites.lua -- a reference ' ..
             'checker script (v2.4)', 74))
  print(wrap('Copyright (c) 2012, 2019, ' ..
             'Enrico Gregorio, Paulo ' ..
             'Roberto Massa Cereda', 74))
end

-- Operation namespace
local operations = {}

-- Reports the unused references.
-- @param citations Citations.
-- @param references References.
-- @param crossrefs Table mapping an entry key to the key it
--                  cross-references.
-- @return Integer representing the status (0 when nothing is unused,
--         1 otherwise).
operations.unused = function(citations, references, crossrefs)
  print()
  print(pad('-', 74))
  print(wrap('Report of unused references in your TeX ' ..
             'document (that is, references present in ' ..
             'bibliography files, but not cited in ' ..
             'the TeX source file)', 74))
  print(pad('-', 74))

  -- Work on a copy so that the caller's table is left untouched.
  local cited = {}
  for _, citation in ipairs(citations) do
    table.insert(cited, citation)
  end

  -- An entry cross-referenced by a cited entry counts as cited too.
  -- The table goes first in exists(), as in every other call.
  for _, citation in ipairs(citations) do
    local target = crossrefs[citation]
    if target and not exists(cited, target) then
      table.insert(cited, target)
    end
  end

  local r = difference(references, cited)
  print()
  print(wrap('Unused references in your TeX document: ' ..
             tostring(#r), 74))
  if #r == 0 then
    return 0
  else
    for _, v in ipairs(r) do
      print('=> ' .. v)
    end
    return 1
  end
end

-- Reports the undefined references.
-- @param citations Citations.
-- @param references References.
-- @return Integer value indicating the status.
operations.undefined = function(citations, references, crossrefs)
  -- crossrefs maps an entry key to the key it cross-references.
  -- Returns 0 when every citation is defined, 1 otherwise.
  print()
  print(pad('-', 74))
  print(wrap('Report of undefined references in your TeX ' ..
             'document (that is, references cited in the ' ..
             'TeX source file, but not present in the ' ..
             'bibliography files)', 74))
  print(pad('-', 74))

  -- Work on a copy so that the caller's table is left untouched.
  local cited = {}
  for _, citation in ipairs(citations) do
    table.insert(cited, citation)
  end

  -- An entry cross-referenced by a cited entry counts as cited too.
  -- The table goes first in exists(), as in every other call.
  for _, citation in ipairs(citations) do
    local target = crossrefs[citation]
    if target and not exists(cited, target) then
      table.insert(cited, target)
    end
  end

  local r = difference(cited, references)
  print()
  print(wrap('Undefined references in your TeX document: ' ..
             tostring(#r), 74))
  if #r == 0 then
    return 0
  else
    for _, v in ipairs(r) do
      print('=> ' .. v)
    end
    return 1
  end
end

-- Reports both unused and undefined references.
-- @param citations Citations.
-- @param references References.
-- @param crossrefs Table mapping an entry key to the key it
--                  cross-references.
-- @return Integer value indicating the status (1 when either report
--         found something, 0 otherwise).
operations.all = function(citations, references, crossrefs)
  local x, y
  x = operations.unused(citations, references, crossrefs)
  y = operations.undefined(citations, references, crossrefs)
  if x + y > 0 then
    return 1
  else
    return 0
  end
end

-- Filters a table of files, keeping the inexistent ones.
-- @param files Table.
-- @param lib Search library.
-- @param enabled Boolean switch to enable lookup.
-- @param extension Extension for lookup.
-- @return Table of inexistent files.
-- @return Table of existent files.
local function validate(files, lib, enabled, extension)
  local bad, good = {}, {}
  for _, v in ipairs(files) do
    if valid(v) then
      table.insert(good, v)
    else
      -- Search the TeX tree once and reuse the result.
      local found = enabled and lookup(lib, v, extension)
      if found then
        table.insert(good, found)
      else
        table.insert(bad, v)
      end
    end
  end
  return bad, good
end

-- Main function.
-- @param args Command line arguments.
+-- @return Integer value indicating the status +local function checkcites(args) + + local kpse = require('kpse') + kpse.set_program_name('texlua') + + header() + + local parameters = { + { short = 'a', long = 'all', argument = false }, + { short = 'u', long = 'unused', argument = false }, + { short = 'U', long = 'undefined', argument = false }, + { short = 'v', long = 'version', argument = false }, + { short = 'h', long = 'help', argument = false }, + { short = 'c', long = 'crossrefs', argument = false }, + { short = 'b', long = 'backend', argument = true } + } + + local keys, err = parse(parameters, args) + local check, backend = 'all', 'bibtex' + + if #err ~= 0 then + print() + print(pad('-', 74)) + print(wrap('I am sorry, but I do not recognize ' .. + 'the following ' .. plural(#err, 'option', + 'options') .. ':', 74)) + for _, v in ipairs(err) do + print('=> ' .. v) + end + + print() + print(wrap('Please make sure to use the correct ' .. + 'options when running this script. You ' .. + 'can also refer to the user documentation ' .. + 'for a list of valid options. The script ' .. + 'will end now.', 74)) + return 1 + end + + if count(keys) == 0 then + print() + print(pad('-', 74)) + print(wrap('I am sorry, but you have not provided ' .. + 'any command line argument, including ' .. + 'files to check and options. Make ' .. + 'sure to invoke the script with the actual ' .. + 'arguments. Refer to the user documentation ' .. + 'if you are unsure of how this tool ' .. + 'works. The script will end now.', 74)) + return 1 + end + + if keys['version'] or keys['help'] then + if keys['version'] then + print() + print(wrap('checkcites.lua, version 2.4 (dated September ' .. + '3, 2019)', 74)) + + print(pad('-', 74)) + print(wrap('You can find more details about this ' .. + 'script, as well as the user documentation, ' .. 
+ 'in the official source code repository:', 74)) + + print() + print('https://github.com/cereda/checkcites') + + print() + print(wrap('The checkcites.lua script is licensed ' .. + 'under the LaTeX Project Public License, ' .. + 'version 1.3. The current maintainers ' .. + 'are the original authors.', 74)) + else + print() + print(wrap('Usage: ' .. args[0] .. ' [ [ --all | --unused | ' .. + '--undefined ] [ --backend ] [ ' .. + ' ... ] | --help | --version ' .. + ']', 74)) + + print() + print('-a,--all list all unused and undefined references') + print('-u,--unused list only unused references in your bibliography files') + print('-U,--undefined list only undefined references in your TeX source file') + print('-c,--crossrefs enable cross-reference checks (disabled by default)') + print('-b,--backend set the backend-based file lookup policy') + print('-h,--help print the help message') + print('-v,--version print the script version') + + print() + print(wrap('Unless specified, the script lists all unused and ' .. + 'undefined references by default. Also, the default ' .. + 'backend is set to "bibtex". Please refer to the user ' .. + 'documentation for more details.', 74)) + end + return 0 + end + + if not keys['unpaired'] then + print() + print(pad('-', 74)) + print(wrap('I am sorry, but you have not provided ' .. + 'files to process. The tool requires ' .. + 'least one file in order to properly ' .. + 'work. Make sure to invoke the script ' .. + 'with an actual file (or files). Refer ' .. + 'to the user documentation if you are ' .. + 'unsure of how this tool works. The ' .. + 'script will end now.', 74)) + return 1 + end + + if keys['backend'] then + if not exists({ 'bibtex', 'biber' }, keys['backend'][1]) then + print() + print(pad('-', 74)) + print(wrap('I am sorry, but you provided an ' .. + 'invalid backend. I know two: ' .. + '"bibtex" (which is the default ' .. + 'one) and "biber". Please make ' .. + 'sure to select one of the two. ' .. 
+ 'Also refer to the user documentation ' .. + 'for more information on how these ' .. + 'backends work. The script will end ' .. + 'now.', 74)) + return 1 + else + backend = keys['backend'][1] + end + end + + if not keys['all'] then + if keys['unused'] and keys['undefined'] then + check = 'all' + elseif keys['unused'] or keys['undefined'] then + check = (keys['unused'] and 'unused') or + (keys['undefined'] and 'undefined') + end + end + + local auxiliary = apply(keys['unpaired'], function(a) + return sanitize(a, (backend == 'bibtex' + and 'aux') or 'bcf') end) + + local invalid, _ = validate(auxiliary, kpse, false, 'aux') + if #invalid ~= 0 then + print() + print(pad('-', 74)) + print(wrap('I am sorry, but I was unable to ' .. + 'locate ' .. plural(#invalid, 'this file', + 'these files') .. ' (the extension ' .. + 'is automatically set based on the ' .. + '"' .. backend .. '" backend):', 74)) + for _, v in ipairs(invalid) do + print('=> ' .. v) + end + + print() + print(wrap('Selected backend: ' .. backend, 74)) + print(wrap('File lookup policy: add ".' .. + ((backend == 'bibtex' and 'aux') or 'bcf') .. + '" to files if not provided.', 74)) + + print() + print(wrap('Please make sure the ' .. plural(#invalid, + 'path is', 'paths are') .. ' ' .. + 'correct and the ' .. plural(#invalid, + 'file exists', 'files exist') .. '. ' .. + 'There is nothing I can do at the moment. ' .. + 'Refer to the user documentation for ' .. + 'details on the file lookup. If ' .. plural(#invalid, + 'this is not the file', 'these are not the ' .. + 'files') .. ' you were expecting, ' .. + 'double-check your source file or ' .. + 'change the backend option when running ' .. + 'this tool. The script will end now.', 74)) + return 1 + end + + local lines = flatten(apply(auxiliary, read)) + local asterisk, citations, bibliography = backends[backend](lines, true) + + print() + print(wrap('Great, I found ' .. tostring(#citations) .. ' ' .. + plural(#citations, 'citation', 'citations') .. 
' in ' .. + tostring(#auxiliary) .. ' ' .. plural(#auxiliary, 'file', + 'files') ..'. I also found ' .. tostring(#bibliography) .. + ' ' .. 'bibliography ' .. plural(#bibliography, 'file', + 'files') .. '. Let me check ' .. plural(#bibliography, + 'this file', 'these files') .. ' and extract the ' .. + 'references. Please wait a moment.', 74)) + + if asterisk then + print() + print(wrap('Also, it is worth noticing that I found a mention to ' .. + 'a special "*" when retrieving citations. That means ' .. + 'your TeX document contains "\\nocite{*}" somewhere in ' .. + 'the source code. I will continue with the check ' .. + 'nonetheless.', 74)) + end + + bibliography = apply(bibliography, function(a) + return sanitize(a, 'bib') end) + + invalid, bibliography = validate(bibliography, kpse, true, 'bib') + if #invalid ~= 0 then + print() + print(pad('-', 74)) + print(wrap('I am sorry, but I was unable to locate ' .. + plural(#invalid, 'this file', 'these files') .. ' ' .. + '(the extension is automatically set to ' .. + '".bib", if not provided):', 74)) + for _, v in ipairs(invalid) do + print('=> ' .. v) + end + + print() + print(wrap('Please make sure the ' .. plural(#invalid, + 'path is', 'paths are') .. ' ' .. + 'correct and the ' .. plural(#invalid, + 'file exists', 'files exist') .. '. ' .. + 'There is nothing I can do at the moment. ' .. + 'Refer to to the user documentation ' .. + 'for details on bibliography lookup. If ' .. + plural(#invalid, 'this is not the file', + 'these are not the files') .. ' you were ' .. + 'expecting (wrong bibliography), double-check ' .. + 'your source file. The script will end ' .. + 'now.', 74)) + return 1 + end + + local references = flatten(apply(bibliography, function(a) + return extract(read(a)) end)) + + local crossrefs = (keys['crossrefs'] and organize(apply(bibliography, + function(a) return crossref(read(a)) end))) or {} + + print() + print(wrap('Fantastic, I found ' .. tostring(#references) .. + ' ' .. 
plural(#references, 'reference', + 'references') .. ' in ' .. tostring(#bibliography) .. + ' bibliography ' .. plural(#bibliography, 'file', + 'files') .. '. Please wait a moment while the ' .. + plural(((check == 'all' and 2) or 1), 'report is', + 'reports are') .. ' generated.', 74)) + + return operations[check](citations, references, crossrefs) +end + +-- Call and exit +os.exit(checkcites(arg)) + +-- EOF diff --git a/tests/immaculate/cleanedBib.csv b/tests/aux/cleanedBib.csv similarity index 100% rename from tests/immaculate/cleanedBib.csv rename to tests/aux/cleanedBib.csv diff --git a/tests/document.aux b/tests/aux/document.aux similarity index 100% rename from tests/document.aux rename to tests/aux/document.aux diff --git a/tests/pipeline.py b/tests/aux/pipeline.py similarity index 57% rename from tests/pipeline.py rename to tests/aux/pipeline.py index f0e1e70..999665b 100644 --- a/tests/pipeline.py +++ b/tests/aux/pipeline.py @@ -2,14 +2,14 @@ from habanero import Crossref import sys import os -wd = os.getcwd() -print(f'{wd[0:-6]}/utils') -print(f'{wd[0:-6]}/utils') -sys.path.insert(1, f'{wd[0:-6]}/utils') +from pathlib import Path +wd = Path(os.getcwd()) +sys.path.insert(1, f'{wd.parent.parent.absolute()}/utils') from preprocessing import * cr = Crossref() -homedir = '/home/jovyan/' +#homedir = '/home/jovyan/' +homedir = os.getcwd() + '/' bib_files = glob.glob(homedir + '*.bib') paper_aux_file = glob.glob(homedir + '*.aux') paper_bib_file = 'library_paper.bib' @@ -18,9 +18,9 @@ except: print('No optional .tex file found.') -yourFirstAuthor = 'LastName, FirstName OptionalMiddleInitial' -yourLastAuthor = 'LastName, FirstName OptionalMiddleInitial' -optionalEqualContributors = ['LastName, FirstName OptionalMiddleInitial', 'LastName, FirstName OptionalMiddleInitial'] +yourFirstAuthor = 'Stiso, Jennifer ' +yourLastAuthor = 'Bassett, Dani ' +optionalEqualContributors = ['Zhou, Dale'] checkingPublishedArticle = False ## end of user input @@ -29,9 +29,9 @@ bib_data 
= get_bib_data(homedir) if checkingPublishedArticle: - FA,LA = get_names_published(homedir, bib_data) + get_names_published(homedir, bib_data, cr) else: # find and print duplicates get_duplicates(bib_data) # get names, remove CDS, find self cites - FA,LA = get_names(bib_data) \ No newline at end of file + get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr) \ No newline at end of file diff --git a/tests/testBib_immaculate.bib b/tests/aux/testBib_immaculate.bib similarity index 100% rename from tests/testBib_immaculate.bib rename to tests/aux/testBib_immaculate.bib diff --git a/tests/cleanedBib.csv b/tests/cleanedBib.csv new file mode 100644 index 0000000..1ca27bc --- /dev/null +++ b/tests/cleanedBib.csv @@ -0,0 +1 @@ +Article,FA,LA,Title,SelfCite,CitationKey diff --git a/tests/erroneous/pipeline.py b/tests/erroneous/pipeline.py new file mode 100644 index 0000000..999665b --- /dev/null +++ b/tests/erroneous/pipeline.py @@ -0,0 +1,37 @@ +import glob +from habanero import Crossref +import sys +import os +from pathlib import Path +wd = Path(os.getcwd()) +sys.path.insert(1, f'{wd.parent.parent.absolute()}/utils') +from preprocessing import * + +cr = Crossref() +#homedir = '/home/jovyan/' +homedir = os.getcwd() + '/' +bib_files = glob.glob(homedir + '*.bib') +paper_aux_file = glob.glob(homedir + '*.aux') +paper_bib_file = 'library_paper.bib' +try: + tex_file = glob.glob(homedir + "*.tex")[0] +except: + print('No optional .tex file found.') + +yourFirstAuthor = 'Stiso, Jennifer ' +yourLastAuthor = 'Bassett, Dani ' +optionalEqualContributors = ['Zhou, Dale'] +checkingPublishedArticle = False + +## end of user input +if paper_aux_file: + find_unused_cites(paper_aux_file) + +bib_data = get_bib_data(homedir) +if checkingPublishedArticle: + get_names_published(homedir, bib_data, cr) +else: + # find and print duplicates + get_duplicates(bib_data) + # get names, remove CDS, find self cites + get_names(homedir, bib_data, yourFirstAuthor, 
yourLastAuthor, optionalEqualContributors, cr) \ No newline at end of file diff --git a/tests/testBib_erroneous.bib b/tests/erroneous/testBib_erroneous.bib similarity index 100% rename from tests/testBib_erroneous.bib rename to tests/erroneous/testBib_erroneous.bib diff --git a/tests/immaculate/cleanedBib_test.csv b/tests/immaculate/cleanedBib_test.csv new file mode 100644 index 0000000..fb49df2 --- /dev/null +++ b/tests/immaculate/cleanedBib_test.csv @@ -0,0 +1,15 @@ +Article,FA,LA,Title,SelfCite,CitationKey +2,Gyorgy,Edvard,Memory navigation and theta rhythm in the hippocampal-entorhinal system,N,buzsaki2013memory +3,Jamie,Dina,“I don't see gender”: Conceptualizing a gendered system of academic publishing,N,Lundine2019 +4,Perry,Danielle,Network architectures supporting learnability,N,zurn2020network +5,William,William,Moralia Volume VI,N,moralia2005 +6,Danielle,Perry,Curious Minds,N,bassett2022curious +7,Danielle,Jennifer,fake,Y,fake2022 +8,,JH,N-gram language models,N,jurafsky2018n +9,Sara,Holly,Gendered citation patterns in international relations journals,N,mitchell2013gendered +10,Paula,Rachel,Gender Disparity in Citations in High-Impact Journal Articles,N,chatterjee2021gender +11,Jacqueline,Bradley,Gender (Im)balance in Citation Practices in Cognitive Neuroscience,N,fulvio2021imbalance +12,Denzel,Ketanji,Test of ethnicolr,N,ethnicolr2022black +13,Rafael,Alexandria,Test of ethnicolr,N,ethnicolr2022hispanic +14,Andrew,Michelle,Test of ethnicolr,N,ethnicolr2022asian +15,Nicolas,Meryl,Test of ethnicolr,N,ethnicolr2022white diff --git a/utils/__pycache__/preprocessing.cpython-39.pyc b/utils/__pycache__/preprocessing.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba7ab28fc57decb39b0b40da6dd0f71ec7b9b663 GIT binary patch literal 13102 zcmc&)U2GiJb)LVS{o!)Cr1&e#%EWOkuS!vtVx%eSsFp?1W+G8cDF0|Dq5f@1SITt z&de@%Ny$hGx823uxpU{3?UJJW6Fv0)bUeI>%l`oq zp$N66l-^3r8@TJWRNX9_$-8tp&F?bh4DLoPThEnq$-9B_0LoIe 
z{B=c`BK?k19uygo#l0YMVgUCckr#uw7ezq~;XW*iVi@-kF(QuPeoTytG2BPRC&h%A zd`B&hi79a$CF9}=aRT=V@uWD3`=t1UuyCIe>atRL>H)^1ESA)0=!zXUw--I(_~)-X zw*wTJtJPJMA(wxtx{AUjTl$W~vwFIsQp4nz!rjCL&ME${APJOBRj5J>!G$ShUB|N~ zjQ6#+8mgQ6o*`2ARLbwAgo&IMn7c|_54Abvt(mqFq(fs@75Ys>&V~k25b2v5^7>u| z&*02Av?=ADsw!KvA{Xl0irfw{nowU>L>8kOXw?JzrejrI%WmGb)~YqfTJxlZ!C1np zH0zEVOk2_kn$opuRX?!2HLFq$?4ato)=g)}&sq2d8+KsXlIhHV7crsLW?;EqK*T$= z$|fk+ns%#D*>Ea1LCNuF@GOZ*#7I97UNd-@Y6fd(J{zgIhw7OJ8s06oUODqI{Yu#= z=Qmdy(yKVWAL+8`M#j>@(q&1KiA>>CAloSAH)_=&GHBG1UUS^Yv>Oe_6{S=;17@ud zvm#SEezO*oi%qxbJ7UF_(%xB-_O0?rU&&h4wQG@j`@SN_@Rf(ZboNy!h=0}(s=l{* zb}r$ye^xl#XDc1@?qpK0tw!w1Y;0k;~>AP<`UI1h9)QQrb2w|{AijQxL6SdG9lCusxuQPff_1MDoyC5JiF>16cTEv;cn8s zt-0q>n-uNMK>sOf8-ek4HPoPPnUQNcjA;qHK zTFqXcwV)6d-qvXD#Ux^_oVA!+E8S*Mit9W`V^9x%yuZKSDx8|OmZTRrmB10to{E%K z>QrgkIu+?4T1rKe70=yvWWe)!x#3i*cJ0yzW^kmVYf&2Puh|tRno8>RR=MuIdVT)0 zQSLRn)^sjQ>B*8F8Pu7`ppHkmgYL29c6c@yHJ zntgVJG_`9-KEy+)>?!vYwiGJvTA+pMt_n+`vshmd+MW)f>B5LDLh=lo5kG56+i0iU zIn+-ONlVvmLnx8;<3Jk%@jxTVzjeRf7$f8^@|wg@CalPQyOcy=_OZT09`FeyCcDlGdpQ zq~~m09)LK;{XU5+B+hPg62O$yrLCqlB#K!JLYQ zW9^9r5|Bhbj|#DAOeAth9_R7CPaf16QX4jnWLN`bSJ{J6yhr>{Tc`W^K?Z3BS1{XW zmiVEhNiw?8Ae*F>z~hP~wVX=|(Cp9%NU*%t4{!o)QN4Jk@%-ujh~Hb6H=UpNmqqU#V6jv%2nj(vf6LOC}FSej52`py^g= znG|+lM_JUV+qG&7>!IUPu`EynEi&jMkwLW%kJCbpHC#T8MKSZLv~cfRq%!mZ7mxIx zkr9{;LR%-h2HSK~mrI17;yJMntrupfRH5td8*Q>d8Cb9^E*1D_DS6J(GcIF_aFh-l z^$Xjl7amcbhNVl3Ay_(-EFIAcj3BkEZY%yja^9q; zp67H=-VE?W&+G*^L0HXH%G6;!df#99}x zr$IHS!LpKlIwY5t?VV4Mw#Vy@tebtfFlwNkxp3flh==ZH>tFCZv zqeH7Z)X8|7^NfB1B8zmC0Xp&G&T~A{F`kkUrM8-m+yUfsYFL@EY**{RTP9#t;9%9Q z1Dx6GPHQmm0=sr5##w%pLTBrKIkjHh1|(Bw*p{G`U9Z-5TH3T#%0ya4MyXq#yy?qN zpbN1{A-fw5?RZT&PZs@R+#!of3siaq20tz=W#s3mIK@0H5BBk>oQ{X$%a^H+T8(Yd z89W@)@Mw${Cx2dg^*5xmww!R?8`bVq)tsFcz$aHpq+pel65DwM=`U~FybLO79OTHIyf5~W_>(j1=-Eibe z2hq-24V#u&j~oJmxGc}s$g7NR542&{YH6Rlun4r1Y5=|k3^Air#cR3&CVruxYI!I@ z!&P^M-yG)EFaWYxNh+~h@aq_pPqSZ1<-r@dgq$Re>N62!Ji5kk4i5o<5R8L50dtL| z6F3%2+UbCoO_6#ZOK70ujqZ*I#$GCcOti!T?oc_oMx}J?yL~L$@am4J%2|st7IIKn 
zPwp^l7^7O?8hL!PmfeA~fl~vVShsF%R4W_FYho+h1nXJhtl4lItg6o*Qc}Dc6Dh@0 zx-&)LPpyyvjRj$x#&bEHRJ}f5(#nO3SFd~SN{plZ@?>nZS6t8a1KSn06f40_!|`J; z3pi)J=B-L%TRF#{WM8H{0LrwOk<_J*w~@IN`>{rpPFN6`aUaXt!d#T;bnUP-^XpEq z(s}vYpybm!s-z1LB&m8qlds{qPob$hw9K$V3uM*_&MKQ&dhaRh?h>fQGdvPn;s7uS zWX3XM7q|dbH$td7+}%QuBJc!=W5}-t@LPnrmku%vi+M799m@nHooRiU`O02!r^nM4 zxMpDtmT1BWtYC?c{oKTD`6zYhz}Hw>=~D}Hr)I5F57kp_`&a_43k$AACdcaf%G1^@ za=e@PEPXlwv-Z+$4n+9Tn>9O}Gv`x%o7p#&p$)t$eO`4=1$m`uLn^;$3A;6_aA>olGzm4(_VNlC zHTpD=WT5#!>D9q-geictTOR~dVICC5ZW?!W`Pae3E_5`^L-V%VV|`N?*B$#f07a1)wibO9t6k5lfh)%w_Plw;v`W*00$o78llrqw63yoyS3rL z0dG_r4!O-7Haq~QyvQd;v$K%RU!3NKR`R=YkssMGJ}hvY#1Su5S7$2zcBJ0o#s0n; zrCsM1ec_=x{ZI|>>rqzVi@=JGEN8cx6xTtprktiM7To)0IejsP#*sFEG19JFET^s$ zFfM007_2-1%*!#%FE~4kts(&q0f@QgNcb;UD9Q6^83ef>AcC|n=kVOOT9FgR#*GZz zaU`Mg)(C}PgtDzl;EvZI%q=+XV3^iQz5V6(TL?*zg#|d$W}b`G%Hc&b`@*{8I=370 z!atzef8bJD&yWpf7`f8r?aT``uVUBy3p3pYtx+C9GHf6Hj(KR8ROx_D>|nFgOhC17 zBs2quIsK6*Ue79U3sB_VCOMG3EL)GH~G{1;`YI}bIW*s*LjYrSyh z40}6gz(>}L*7?^jera)G@#R@-5l}eRC-%W^*}jEv>~IMymO_vTFL30Va9BhB+8gS-ty4Q@kM ze2S)1Brh^N1ll8=?0|Vo(oByAzxe94>z7xqE-Wrwy?7R=tc#Eii%|W!C+^s0x2;3z z#iI_ORcTx=BdqFVSEBH<2i&0W3&=Y&`%M<}`S<>NGYY=$`{(qzLE; zuDo^vZ>S^(U$>x6qr^a7!&|DA*Lrh>q^!s-#!<)GH#M}^K8uFDZ+DrK{G*W z(*!Cd44T?ZGX~9u*}cq1V9<@e^;X@??&Zj#xvRhRVmpU+1K(5I13;qL5V5IEAkZl# z9Dvc?+X`Ew-Eu^vfD7|LOebQb)boZAV?M@YVO|V@%3ymCxgzHZK_M6-JS2w2NV26< z42JfAM}-lLY#zG@hj?$4Feb3-DE3EjZx3>-!q3>M5F;;&R|C_2Kk3bQdx)tG{fyLx zK#kfUe#g{?(5tbK`b6|KIg*U=G(=zb72?_v`ntT(L;pDF7rONIo#&6x*SCQF z3HJjlUcjXUPlVVrAY2#@hK0}!bGsRM6^WO@Gv^8D7NTG@Chn6dl~uUUiFUGqKm z_$k=O>!i=i?4`No_=!K!aqg}87`k?ltY!6LS@{*%7r84;cM2;3~f6Iu1Vo7VQZnde$JzGO31`Tz)m2|v9gnI=ojN1%~e z%=@0PKz{b#oNN8!BX}ci(1x=^vJDsuVyp!8D4f`pTI)hGc2V^Kt!V71-{CMf-l(o` zoWbZCAZTqOu7#rxU0B7IU6ymc26f-3fEdh6^W5bBwjjnsn* z(63)z>!=6oA3;WlvMDj7*2_y00b7g@*ctSJvb$oS_S}HR>^s5%i5+sVWsuQ9{07Iomy65N}7QZ%1l7c8JJx?Aao4@mP9N22|Rl zWSf#(l-#CdhY|vT@@+~&N*LkZK#n#?NFU2-D7=rEhy7;+ml3c9$c^1Z-Y#U28f61% 
zyFP6+M%g+J`S2s1u%Xiy(Lkf=R)To4$GapHFDwr;!FaPfp~%?!SE(akqvY3+-~bKG z*68r*L#V)=l{TM9oQq6ZL`35CC?l+Oc z>8#F7<1jJzD~R{(wEedwo%?k}fAfLcUH3LA3iOAIpiI z`=?P54Y3`hAWLEe-=n&;n|xo3178iwt8C{E#h2P-W+=i@z=Ve>*~d;UPIYPft&<04 zvnPu8=&0AnI$}iYPS60lBbMKx#}6p^T}ry)Z?+^cf<@pCkd-O;BtUHh=Z>St$mjMdPxcpHh zJX7;DQ)!pHC$?WwIk*YNQ*C>kdFJ8TDU0qmSMr7T(_9biS8nQ1@ynFG|wW~{4E?>WlTCHP`pb@Eyt@9n~KRvBd zD&7Mpbc^%6&H6zY7oe3^l6dyAX`w*o0l<_DPv1mYxzGWZd`{|Id8q%{K3uSRsKxn1 z70w@SaelR&rByGE68C@Z(}x;=+CD_xm%l|FCmW98b|cV|)tDz8k8YeQeH%%QG>04$ zWiW%oMw*t9VBvw z%A+`yfm-pglbPh)3m;yQ-=g=}_jBlI1Rp@+lhW~8a-P2PBqbIl6w!~*5WPk@Ia?>WF|ACnPZtO{wFfoToEO2WWMs> DWK+_P literal 0 HcmV?d00001 diff --git a/utils/__pycache__/queries.cpython-39.pyc b/utils/__pycache__/queries.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e5b128322dd86be564bcfd5e2eee6253afb9213 GIT binary patch literal 838 zcmZuvO-~d-5bckd*Y; zSAKypfusM0U(g3Ho;Z8spjOWs@6Aa4A1|K4ptH74yO5uP;sDRrWb(KLD5CVk*z^c2M~^E9aa>=G1mtb zIV2T>2wEcsZRk`~QkXjn>ZpofzE7Zw$;N@c{KI>mW48LF4hz7$Ugy&5z(U176S(z- z!(yYi1h>z49ifL7gmwF$LWQ?sai(-mFVKNUbok#cuv~>z&?iS!fAc!U`i#SJpPVyT zfjbxMoL@A)uuB`KvwFJy&WL#T^~*RdAWpTFnG&&;RtXc^GX5|X`Xt`br7>Cz<6)`e zbZQT9QPNl!4y88s3I|DCpi(S!P7l zR%qACg)!+!oc3&KQ}sv-Gga2OP+F0j(^g~b$;LI^aA8wl`FAnk+An(EgMR@o9LXb5 zIJ$LBW8d_+)MHdp<*{#~#YCl&FSsP@BY9Y0AT7x6G!wD6C0993?o;w0JJ}lc|-;B>wNEFk6!o z*Yg_Yg)Z~=I9fj=^ALfsHjQYT1}wsAjV{tvWa~7*FAwMn@6w2`Gaci}R|DFtjg1>{ TANO2)J1=3X#8a>Q1C#s)E*jeE literal 0 HcmV?d00001 diff --git a/utils/preprocessing.py b/utils/preprocessing.py index 890a056..bd2ba0a 100644 --- a/utils/preprocessing.py +++ b/utils/preprocessing.py @@ -1,6 +1,13 @@ import subprocess from pylatexenc.latex2text import LatexNodes2Text import unicodedata +import glob +from pybtex.database.input import bibtex +import os +import csv +from bibtexparser.bparser import BibTexParser +import string +from queries import * def checkcites_output(aux_file): '''take in aux file for tex document, return list of citation keys @@ -23,7 +30,7 
@@ def checkcites_output(aux_file): def clean_name(name, flag): """ - :param name: + :param name: string author name flag: utf or latex :return: clean_name """ @@ -42,11 +49,13 @@ def clean_name(name, flag): else: raise ValueError + return clean_name + def removeMiddleName(line): """ - :param line: - :return: + :param line: string author name + :return: the same string, but with the middle name removed """ arr = line.split() last = arr.pop() @@ -62,11 +71,11 @@ def removeMiddleName(line): return str(first + ' ' + middle) -def returnFirstName(line): +def returnMiddletName(line): """ - :param line: - :return: + :param line: string author name + :return: only the middle name """ arr = line.split() n = len(arr) @@ -102,9 +111,9 @@ def convertSpecialCharsToUTF8(text): def namesFromXrefSelfCite(doi, title): """ - :param doi: - :param title: - :return: + :param doi: DOI of published article + :param title: the title of the same article + :return: selfCiteCheck: the number of self citations in a published article (indexed by DOI """ selfCiteCheck = 0 # get cross ref data @@ -131,22 +140,25 @@ def find_unused_cites(paper_aux_file): """ :param paper_aux_file: path to auxfile - :return: """ print(checkcites_output(paper_aux_file)) unused_in_paper = checkcites_output(paper_aux_file) # get citations in library not used in paper print("Unused citations: ", unused_in_paper.count('=>')) -def get_bib_data(homedir): +def get_bib_data(homedir, parser=""): """ :param homedir: home directory + parser: a string telling which parser to use (default is not to use bparser) :return: bib_data """ ID = glob.glob(homedir + '*bib') - with open(ID[0]) as bibtex_file: - bib_data = bibtexparser.bparser.BibTexParser(common_strings=True, - ignore_nonstandard_types=False).parse_file(bibtex_file) + if parser == 'bparser': + bib_data = BibTexParser(common_strings=True, ignore_nonstandard_types=False).parse_file(bibtex_file) + else: + parser = bibtex.Parser() + bib_data = parser.parse_file(ID[0]) + 
return bib_data def get_duplicates(bib_data): @@ -157,7 +169,7 @@ def get_duplicates(bib_data): """ duplicates = [] - for key in bib_data.entries_dict.keys(): + for key in bib_data.entries.keys(): count = str(bib_data.entries).count("'ID\': \'" + key + "\'") if count > 1: duplicates.append(key) @@ -167,7 +179,7 @@ def get_duplicates(bib_data): ' '.join(map(str, duplicates))) -def get_names_published(outPath, bib_data): +def get_names_published(homedir, bib_data, cr): """ whole pipeline for published papers :return: FA, @@ -176,7 +188,6 @@ def get_names_published(outPath, bib_data): FA = [] LA = [] counter = 1 - selfCiteCount = 0 titleCount = 1 # counterNoDOI = list() # row index (titleCount) of entries with no DOI outPath = homedir + 'cleanedBib.csv' @@ -212,13 +223,13 @@ def get_names_published(outPath, bib_data): articleNum = 0 for doi in citedArticleDOI: try: - FA = namesFromXref(doi, '', 'first') + FA = namesFromXref(cr, doi, '', 'first') except UnboundLocalError: sleep(1) continue try: - LA = namesFromXref(doi, '', 'last') + LA = namesFromXref(cr, doi, '', 'last') except UnboundLocalError: sleep(1) continue @@ -256,14 +267,13 @@ def get_names_published(outPath, bib_data): return FA, LA -def get_names(bib_data): +def get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr): """ take bib_data, and get lists of first and last names. 
should also get self cites and CDS cites :return: FA LA """ counter = 1 - nameCount = 0 outPath = homedir + 'cleanedBib.csv' if os.path.exists(outPath): @@ -329,7 +339,7 @@ def get_names(bib_data): if FA == '' or len(FA.split('.')[0]) <= 1: while True: try: - FA = namesFromXref(doi, title, 'first') + FA = namesFromXref(cr, doi, title, 'first') except UnboundLocalError: sleep(1) continue @@ -337,13 +347,13 @@ def get_names(bib_data): if LA == '' or len(LA.split('.')[0]) <= 1: while True: try: - LA = namesFromXref(doi, title, 'last') + LA = namesFromXref(cr, doi, title, 'last') except UnboundLocalError: sleep(1) continue break - self_cites(author, yourFirstAuthor,yourLastAuthor, optionalEqualContributors) + selfCite = self_cites(author, yourFirstAuthor,yourLastAuthor, optionalEqualContributors, FA, LA, counter, key) counter += 1 with open(outPath, 'a', newline='') as csvfile: writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) @@ -351,7 +361,7 @@ def get_names(bib_data): [counter, convertSpecialCharsToUTF8(FA), convertSpecialCharsToUTF8(LA), title, selfCite, key]) -def self_cites(author, yourFirstAuthor, yourLastAuthor, optionalEqualContributors): +def self_cites(author, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, FA, LA, counter, key): """ take author list, and find self citations @@ -359,8 +369,11 @@ def self_cites(author, yourFirstAuthor, yourLastAuthor, optionalEqualContributor :param yourFirstAuthor: :param yourLastAuthor: :param optionalEqualContributors: + :param FA: + :param LA: :return: """ + if (yourFirstAuthor == 'LastName, FirstName OptionalMiddleInitial') or ( yourLastAuthor == 'LastName, FirstName OptionalMiddleInitial'): raise ValueError("Please enter your manuscript's first and last author names") @@ -381,74 +394,33 @@ def self_cites(author, yourFirstAuthor, yourLastAuthor, optionalEqualContributor [clean_name(s.rich_last_names, 'utf'), LA]).replace("'","")] # I was in the process of cleaning all 
thisup when we stopped - selfCiteCheck2 = [s for s in author if removeMiddleName(yourFirstAuthor) in str([ - convertLatexSpecialChars( - str(s.rich_last_names)[ - 7:-3]).replace( - "', Protected('", - "").replace( - "'), '", ""), - convertLatexSpecialChars( - str(s.rich_first_names)[ - 7:-3]).replace( - "', Protected('", - "").replace( - "'), '", - "")]).replace( - "'", "")] - selfCiteCheck2a = [s for s in author if removeMiddleName(yourFirstAuthor) in str([ - convertSpecialCharsToUTF8( - str(s.rich_last_names)[ - 7:-3]).replace( - "', Protected('", - "").replace( - "'), '", ""), - convertSpecialCharsToUTF8( - str(s.rich_first_names)[ - 7:-3]).replace( - "', Protected('", - "").replace( - "'), '", - "")]).replace( - "'", "")] - selfCiteCheck2b = [s for s in author if removeMiddleName(yourFirstAuthor) in str([ - convertSpecialCharsToUTF8( - str(s.rich_last_names)[ - 7:-3]).replace( - "', Protected('", - "").replace( - "'), '", ""), - FA]).replace("'", - "")] + selfCiteCheck2 = [s for s in author if removeMiddleName(yourFirstAuthor) in + str([clean_name(s.rich_last_names, 'utf'), + clean_name(s.rich_first_names, 'utf')] + ).replace("'", "")] + selfCiteCheck2a = [s for s in author if removeMiddleName(yourFirstAuthor) in + str( + [clean_name(s.rich_last_names, 'utf'), + clean_name(s.rich_first_names, 'utf')] + ).replace("'", "")] + selfCiteCheck2b = [s for s in author if removeMiddleName(yourFirstAuthor) in + str( + [clean_name(s.rich_last_names, 'utf'), + FA]).replace("'","")] nameCount = 0 if optionalEqualContributors != ( 'LastName, FirstName OptionalMiddleInitial', 'LastName, FirstName OptionalMiddleInitial'): for name in optionalEqualContributors: - selfCiteCheck3 = [s for s in author if removeMiddleName(name) in str([convertLatexSpecialChars( - str(s.rich_last_names)[7:-3]).replace("', Protected('", "").replace("'), '", ""), - convertLatexSpecialChars( - str(s.rich_first_names)[ - 7:-3]).replace( - "', Protected('", - "").replace("'), '", - "")]).replace( - "'", 
"")] - selfCiteCheck3a = [s for s in author if removeMiddleName(name) in str([ - convertSpecialCharsToUTF8( - str(s.rich_last_names)[ - 7:-3]).replace( - "', Protected('", - "").replace( - "'), '", ""), - convertSpecialCharsToUTF8( - str(s.rich_first_names)[ - 7:-3]).replace( - "', Protected('", - "").replace( - "'), '", - "")]).replace("'", - "")] + selfCiteCheck3 = [s for s in author if removeMiddleName(name) in + str( [clean_name(s.rich_last_names, 'utf'), + clean_name(s.rich_first_names, 'utf')] + ).replace("'", "")] + selfCiteCheck3a = [s for s in author if removeMiddleName(name) in + str( + [clean_name(s.rich_last_names, 'utf'), + clean_name(s.rich_first_names, 'utf')] + ).replace("'", "")] if len(selfCiteCheck3) > 0: nameCount += 1 if len(selfCiteCheck3a) > 0: @@ -469,6 +441,8 @@ def self_cites(author, yourFirstAuthor, yourLastAuthor, optionalEqualContributor else: print(str(counter) + ": " + key) + return selfCite + From f4d2c9fdf68e4f7b3ea9345a37949e632dcd02d1 Mon Sep 17 00:00:00 2001 From: Jennifer Stiso Date: Tue, 17 May 2022 11:28:04 -0400 Subject: [PATCH 10/47] automatically removed duplicates --- tests/aux/cleanedBib.csv | 28 +-- tests/aux/pipeline.py | 4 +- tests/aux/testBib_immaculate_clean.bib | 237 +++++++++++++++++++++++++ 3 files changed, 253 insertions(+), 16 deletions(-) create mode 100644 tests/aux/testBib_immaculate_clean.bib diff --git a/tests/aux/cleanedBib.csv b/tests/aux/cleanedBib.csv index fb49df2..7f322e7 100644 --- a/tests/aux/cleanedBib.csv +++ b/tests/aux/cleanedBib.csv @@ -1,15 +1,15 @@ Article,FA,LA,Title,SelfCite,CitationKey -2,Gyorgy,Edvard,Memory navigation and theta rhythm in the hippocampal-entorhinal system,N,buzsaki2013memory -3,Jamie,Dina,“I don't see gender”: Conceptualizing a gendered system of academic publishing,N,Lundine2019 -4,Perry,Danielle,Network architectures supporting learnability,N,zurn2020network -5,William,William,Moralia Volume VI,N,moralia2005 -6,Danielle,Perry,Curious Minds,N,bassett2022curious 
-7,Danielle,Jennifer,fake,Y,fake2022 -8,,JH,N-gram language models,N,jurafsky2018n -9,Sara,Holly,Gendered citation patterns in international relations journals,N,mitchell2013gendered -10,Paula,Rachel,Gender Disparity in Citations in High-Impact Journal Articles,N,chatterjee2021gender -11,Jacqueline,Bradley,Gender (Im)balance in Citation Practices in Cognitive Neuroscience,N,fulvio2021imbalance -12,Denzel,Ketanji,Test of ethnicolr,N,ethnicolr2022black -13,Rafael,Alexandria,Test of ethnicolr,N,ethnicolr2022hispanic -14,Andrew,Michelle,Test of ethnicolr,N,ethnicolr2022asian -15,Nicolas,Meryl,Test of ethnicolr,N,ethnicolr2022white +2,Danielle,Perry,,N,bassett2022curious +3,Gyorgy,Edvard,,N,buzsaki2013memory +4,Paula,Rachel,,N,chatterjee2021gender +5,Andrew,Michelle,,N,ethnicolr2022asian +6,Denzel,Ketanji,,N,ethnicolr2022black +7,Rafael,Alexandria,,N,ethnicolr2022hispanic +8,Nicolas,Meryl,,N,ethnicolr2022white +9,Danielle,Jennifer,,Y,fake2022 +10,Jacqueline,Bradley,,N,fulvio2021imbalance +11,,JH,,N,jurafsky2018n +12,,Dina,,N,Lundine2019 +13,Sara,Holly,,N,mitchell2013gendered +14,William,William,,N,moralia2005 +15,Perry,Danielle,,N,zurn2020network diff --git a/tests/aux/pipeline.py b/tests/aux/pipeline.py index 999665b..4bd4640 100644 --- a/tests/aux/pipeline.py +++ b/tests/aux/pipeline.py @@ -27,11 +27,11 @@ if paper_aux_file: find_unused_cites(paper_aux_file) -bib_data = get_bib_data(homedir) +bib_data = get_bib_data(bib_files[0]) if checkingPublishedArticle: get_names_published(homedir, bib_data, cr) else: # find and print duplicates - get_duplicates(bib_data) + bib_data = get_duplicates(bib_data, bib_files[0]) # get names, remove CDS, find self cites get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr) \ No newline at end of file diff --git a/tests/aux/testBib_immaculate_clean.bib b/tests/aux/testBib_immaculate_clean.bib new file mode 100644 index 0000000..dc0ece8 --- /dev/null +++ b/tests/aux/testBib_immaculate_clean.bib @@ 
-0,0 +1,237 @@ +@inproceedings{ambekar2009name, + author = {Ambekar, Anurag and Ward, Charles and Mohammed, Jahangir and Male, Swapna and Skiena, Steven}, + booktitle = {Proceedings of the 15th ACM SIGKDD international conference on Knowledge Discovery and Data Mining}, + pages = {49--58}, + title = {Name-ethnicity classification from open sources}, + year = {2009} +} + +@book{bassett2022curious, + author = {Danielle S. Bassett and Perry Zurn}, + publisher = {MIT Press}, + title = {Curious Minds}, + year = {2022} +} + +@article{bertolero2021racial, + author = {Bertolero, Maxwell A. and Dworkin, Jordan D. and David, Sophia U. and Lloreda, Claudia López and Srivastava, Pragya and Stiso, Jennifer and Zhou, Dale and Dzirasa, Kafui and Fair, Damien A. and Kaczkurkin, Antonia N. and Marlin, Bianca Jones and Shohamy, Daphna and Uddin, Lucina Q. and Zurn, Perry and Bassett, Danielle S.}, + journal = {bioRxiv}, + title = {Racial and ethnic imbalance in neuroscience reference lists and intersections with gender}, + xoi = {10.1101/2020.10.12.336230}, + year = {2020} +} + +@article{buzsaki2013memory, + author = {Buzs{\'a}ki, Gy{\"o}rgy and Moser, Edvard I}, + journal = {Nature neuroscience}, + number = {2}, + pages = {130}, + publisher = {Nature Publishing Group}, + title = {Memory, navigation and theta rhythm in the hippocampal-entorhinal system}, + volume = {16}, + year = {2013} +} + +@article{caplar2017quantitative, + author = {Caplar, Neven and Tacchella, Sandro and Birrer, Simon}, + journal = {Nature Astronomy}, + number = {6}, + pages = {0141}, + publisher = {Nature Publishing Group}, + title = {Quantitative evaluation of gender bias in astronomical publications from citation counts}, + volume = {1}, + year = {2017} +} + +@article{chatterjee2021gender, + author = {Chatterjee, Paula and Werner, Rachel M}, + journal = {JAMA Netw Open}, + number = {7}, + pages = {e2114509}, + title = {Gender Disparity in Citations in High-Impact Journal Articles}, + volume = {4}, + year = 
{2021} +} + +@article{dion2018gendered, + author = {Dion, Michelle L and Sumner, Jane Lawrence and Mitchell, Sara McLaughlin}, + journal = {Political Analysis}, + number = {3}, + pages = {312--327}, + publisher = {Cambridge University Press}, + title = {Gendered citation patterns across political science and social science methodology fields}, + volume = {26}, + year = {2018} +} + +@article{Dworkin2020.01.03.894378, + abstract = {Like many scientific disciplines, neuroscience has increasingly attempted to confront pervasive gender imbalances within the field. While much of the conversation has centered around publishing and conference participation, recent research in other fields has called attention to the prevalence of gender bias in citation practices. Because of the downstream effects that citations can have on visibility and career advancement, understanding and eliminating gender bias in citation practices is vital for addressing inequity in a scientific community. In this study, we sought to determine whether there is evidence of gender bias in the citation practices of neuroscientists. Utilizing data from five top neuroscience journals, we indeed find that reference lists tend to include more papers with men as first and last author than would be expected if gender was not a factor in referencing. Importantly, we show that this overcitation of men and undercitation of women is driven largely by the citation practices of men, and is increasing with time despite greater diversity in the academy. We develop a co-authorship network to determine the degree to which homophily in researchers{\textquoteright} social networks explains gendered citation practices and we find that men tend to overcite other men even when their social networks are representative of the field. We discuss possible mechanisms and consider how individual researchers might incorporate these findings into their own referencing practices.}, + author = {Dworkin, Jordan D. and Linn, Kristin A. 
and Teich, Erin G. and Zurn, Perry and Shinohara, Russell T. and Bassett, Danielle S.}, + doi = {10.1101/2020.01.03.894378}, + elocation-id = {2020.01.03.894378}, + eprint = {https://www.biorxiv.org/content/early/2020/01/11/2020.01.03.894378.full.pdf}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + title = {The extent and drivers of gender imbalance in neuroscience reference lists}, + url = {https://www.biorxiv.org/content/early/2020/01/11/2020.01.03.894378}, + year = {2020} +} + +@article{ethnicolr2022asian, + author = {Wang, Andrew and Yeoh, Michelle}, + journal = {Nature neuroscience}, + number = {2}, + pages = {130}, + publisher = {Nature Publishing Group}, + title = {Test of ethnicolr}, + volume = {16}, + year = {2013} +} + +@article{ethnicolr2022black, + author = {Washington, Denzel and Brown-Jackson, Ketanji}, + journal = {Nature neuroscience}, + number = {2}, + pages = {130}, + publisher = {Nature Publishing Group}, + title = {Test of ethnicolr}, + volume = {16}, + year = {2013} +} + +@article{ethnicolr2022hispanic, + author = {Cruz, Rafael and Ocasio-Cortez, Alexandria}, + journal = {Nature neuroscience}, + number = {2}, + pages = {130}, + publisher = {Nature Publishing Group}, + title = {Test of ethnicolr}, + volume = {16}, + year = {2013} +} + +@article{ethnicolr2022white, + author = {Coppola, Nicolas and Streep, Meryl}, + journal = {Nature neuroscience}, + number = {2}, + pages = {130}, + publisher = {Nature Publishing Group}, + title = {Test of ethnicolr}, + volume = {16}, + year = {2013} +} + +@book{fake2022, + author = {Danielle S. 
Bassett and Dale Zhou and Jennifer Stiso}, + publisher = {MIT Press}, + title = {fake}, + year = {2022} +} + +@article{fulvio2021imbalance, + author = {Fulvio, Jacqueline M and Akinnola, Ileri and Postle, Bradley R}, + journal = {J Cogn Neurosci}, + number = {1}, + pages = {3-7}, + title = {Gender (Im)balance in Citation Practices in Cognitive Neuroscience}, + volume = {33}, + year = {2021} +} + +@article{jurafsky2018n, + author = {Jurafsky, D and Martin, JH}, + journal = {Speech and Language Processing: An Introduction to Natural Language Processing, Computational Linguistics, and Speech Recognition}, + title = {N-gram language models}, + year = {2018} +} + +@article{Lundine2019, + abstract = {Academic experts share their ideas, as well as contribute to advancing health science by participating in publishing as an author, reviewer and editor. The academy shapes and is shaped by knowledge produced within it. As such, the production of scientific knowledge can be described as part of a socially constructed system. Like all socially constructed systems, scientific knowledge production is influenced by gender. This study investigated one layer of this system through an analysis of journal editors' understanding of if and how gender influences editorial practices in peer reviewed health science journals. The study involved two stages: 1) exploratory in-depth qualitative interviews with editors at health science journals; and 2) a nominal group technique (NGT) with experts working on gender in research, academia and the journal peer review process. Our findings indicate that some editors had not considered the impact of gender on their editorial work. Many described how they actively strive to be ‘gender blind,' as this was seen as a means to be objective. This view fails to recognize how broader social structures operate to produce systemic inequities. 
None of the editors or publishers in this study were collecting gender or other social indicators as part of the article submission process. These findings suggest that there is room for editors and publishers to play a more active role in addressing structural inequities in academic publishing to ensure a diversity of knowledge and ideas are reflected.}, + author = {Lundine, J. and Bourgeault, Ivy Lynn and Glonti, Ketevan and Hutchinson, Eleanor and Balabanova, Dina}, + doi = {10.1016/j.socscimed.2019.112388}, + file = {:Users/stiso/Downloads/1-s2.0-S0277953619303739-main.pdf:pdf}, + issn = {18735347}, + journal = {Social Science and Medicine}, + keywords = {Feminist science studies,Gender inequity,Health sciences,Journalology,Peer review,Publishing}, + number = {January}, + pages = {112388}, + pmid = {31288167}, + publisher = {Elsevier}, + title = {{“I don't see gender”: Conceptualizing a gendered system of academic publishing}}, + url = {https://doi.org/10.1016/j.socscimed.2019.112388}, + volume = {235}, + year = {2019} +} + +@article{maliniak2013gender, + author = {Maliniak, Daniel and Powers, Ryan and Walter, Barbara F}, + journal = {International Organization}, + number = {4}, + pages = {889--922}, + publisher = {Cambridge University Press}, + title = {The gender citation gap in international relations}, + volume = {67}, + year = {2013} +} + +@article{mitchell2013gendered, + author = {Mitchell, Sara McLaughlin and Lange, Samantha and Brus, Holly}, + journal = {International Studies Perspectives}, + number = {4}, + pages = {485--492}, + publisher = {Blackwell Publishing Ltd Oxford, UK}, + title = {Gendered citation patterns in international relations journals}, + volume = {14}, + year = {2013} +} + +@book{moralia2005, + author = {Plutarch, Helmbold, William}, + publisher = {Harvard University Press}, + title = {Moralia, Volume VI}, + year = {1939} +} + +@article{sood2018predicting, + author = {Sood, Gaurav and Laohaprapanon, Suriyan}, + journal = {arXiv 
preprint arXiv:1805.02109}, + title = {Predicting race and ethnicity from the sequence of characters in a name}, + year = {2018} +} + +@article{wang2021gendered, + author = {Wang, Xinyi and Dworkin, Jordan D. and Zhou, Dale and Stiso, Jennifer and Falk, Emily B and Bassett, Danielle S. and Zurn, Perry and Lydon-Staley, David M.}, + doi = {10.1080/23808985.2021.1960180}, + journal = {Annals of the International Communication Association}, + title = {Gendered citation practices in the field of communication}, + year = {2021} +} + +@software{zhou_dale_2020_3672110, + author = {Zhou, Dale and +Cornblath, Eli J. and +Stiso, Jennifer and +Teich, Erin G. and +Dworkin, Jordan D. and +Blevins, Ann S. and +Bassett, Danielle S.}, + doi = {10.5281/zenodo.3672110}, + month = {February}, + publisher = {Zenodo}, + title = {Gender Diversity Statement and Code Notebook v1.0}, + url = {https://doi.org/10.5281/zenodo.3672110}, + version = {v1.0}, + year = {2020} +} + +@article{zurn2020network, + author = {Zurn, Perry and Bassett, Danielle S}, + journal = {Philosophical Transactions of the Royal Society B}, + number = {1796}, + pages = {20190323}, + publisher = {The Royal Society}, + title = {Network architectures supporting learnability}, + volume = {375}, + year = {2020} +} + From 3a0c1eb16b879d11e66da04b62c1dfdeee750221 Mon Sep 17 00:00:00 2001 From: Jennifer Stiso Date: Tue, 17 May 2022 12:32:27 -0400 Subject: [PATCH 11/47] updating pipelines --- cleanBib.ipynb | 4 +- tests/erroneous/cleanedBib.csv | 15 ++ tests/erroneous/pipeline.py | 4 +- tests/erroneous/testBib_erroneous_clean.bib | 237 ++++++++++++++++++ tests/immaculate/cleanedBib.csv | 15 ++ tests/immaculate/pipeline.py | 4 +- .../__pycache__/preprocessing.cpython-39.pyc | Bin 13102 -> 13808 bytes utils/preprocessing.py | 49 +++- 8 files changed, 311 insertions(+), 17 deletions(-) create mode 100644 tests/erroneous/cleanedBib.csv create mode 100644 tests/erroneous/testBib_erroneous_clean.bib create mode 100644 
tests/immaculate/cleanedBib.csv diff --git a/cleanBib.ipynb b/cleanBib.ipynb index a5f6cff..4b9593d 100644 --- a/cleanBib.ipynb +++ b/cleanBib.ipynb @@ -137,12 +137,12 @@ "if paper_aux_file:\n", " find_unused_cites(paper_aux_file)\n", "\n", - "bib_data = get_bib_data(homedir)\n", + "bib_data = get_bib_data(bib_files[0])\n", "if checkingPublishedArticle:\n", " get_names_published(homedir, bib_data, cr)\n", "else:\n", " # find and print duplicates\n", - " get_duplicates(bib_data)\n", + " bib_data = get_duplicates(bib_data, bib_files[0])\n", " # get names, remove CDS, find self cites\n", " get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)" ] diff --git a/tests/erroneous/cleanedBib.csv b/tests/erroneous/cleanedBib.csv new file mode 100644 index 0000000..7f322e7 --- /dev/null +++ b/tests/erroneous/cleanedBib.csv @@ -0,0 +1,15 @@ +Article,FA,LA,Title,SelfCite,CitationKey +2,Danielle,Perry,,N,bassett2022curious +3,Gyorgy,Edvard,,N,buzsaki2013memory +4,Paula,Rachel,,N,chatterjee2021gender +5,Andrew,Michelle,,N,ethnicolr2022asian +6,Denzel,Ketanji,,N,ethnicolr2022black +7,Rafael,Alexandria,,N,ethnicolr2022hispanic +8,Nicolas,Meryl,,N,ethnicolr2022white +9,Danielle,Jennifer,,Y,fake2022 +10,Jacqueline,Bradley,,N,fulvio2021imbalance +11,,JH,,N,jurafsky2018n +12,,Dina,,N,Lundine2019 +13,Sara,Holly,,N,mitchell2013gendered +14,William,William,,N,moralia2005 +15,Perry,Danielle,,N,zurn2020network diff --git a/tests/erroneous/pipeline.py b/tests/erroneous/pipeline.py index 999665b..4bd4640 100644 --- a/tests/erroneous/pipeline.py +++ b/tests/erroneous/pipeline.py @@ -27,11 +27,11 @@ if paper_aux_file: find_unused_cites(paper_aux_file) -bib_data = get_bib_data(homedir) +bib_data = get_bib_data(bib_files[0]) if checkingPublishedArticle: get_names_published(homedir, bib_data, cr) else: # find and print duplicates - get_duplicates(bib_data) + bib_data = get_duplicates(bib_data, bib_files[0]) # get names, remove CDS, find self cites 
get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr) \ No newline at end of file diff --git a/tests/erroneous/testBib_erroneous_clean.bib b/tests/erroneous/testBib_erroneous_clean.bib new file mode 100644 index 0000000..dc0ece8 --- /dev/null +++ b/tests/erroneous/testBib_erroneous_clean.bib @@ -0,0 +1,237 @@ +@inproceedings{ambekar2009name, + author = {Ambekar, Anurag and Ward, Charles and Mohammed, Jahangir and Male, Swapna and Skiena, Steven}, + booktitle = {Proceedings of the 15th ACM SIGKDD international conference on Knowledge Discovery and Data Mining}, + pages = {49--58}, + title = {Name-ethnicity classification from open sources}, + year = {2009} +} + +@book{bassett2022curious, + author = {Danielle S. Bassett and Perry Zurn}, + publisher = {MIT Press}, + title = {Curious Minds}, + year = {2022} +} + +@article{bertolero2021racial, + author = {Bertolero, Maxwell A. and Dworkin, Jordan D. and David, Sophia U. and Lloreda, Claudia López and Srivastava, Pragya and Stiso, Jennifer and Zhou, Dale and Dzirasa, Kafui and Fair, Damien A. and Kaczkurkin, Antonia N. and Marlin, Bianca Jones and Shohamy, Daphna and Uddin, Lucina Q. 
and Zurn, Perry and Bassett, Danielle S.}, + journal = {bioRxiv}, + title = {Racial and ethnic imbalance in neuroscience reference lists and intersections with gender}, + xoi = {10.1101/2020.10.12.336230}, + year = {2020} +} + +@article{buzsaki2013memory, + author = {Buzs{\'a}ki, Gy{\"o}rgy and Moser, Edvard I}, + journal = {Nature neuroscience}, + number = {2}, + pages = {130}, + publisher = {Nature Publishing Group}, + title = {Memory, navigation and theta rhythm in the hippocampal-entorhinal system}, + volume = {16}, + year = {2013} +} + +@article{caplar2017quantitative, + author = {Caplar, Neven and Tacchella, Sandro and Birrer, Simon}, + journal = {Nature Astronomy}, + number = {6}, + pages = {0141}, + publisher = {Nature Publishing Group}, + title = {Quantitative evaluation of gender bias in astronomical publications from citation counts}, + volume = {1}, + year = {2017} +} + +@article{chatterjee2021gender, + author = {Chatterjee, Paula and Werner, Rachel M}, + journal = {JAMA Netw Open}, + number = {7}, + pages = {e2114509}, + title = {Gender Disparity in Citations in High-Impact Journal Articles}, + volume = {4}, + year = {2021} +} + +@article{dion2018gendered, + author = {Dion, Michelle L and Sumner, Jane Lawrence and Mitchell, Sara McLaughlin}, + journal = {Political Analysis}, + number = {3}, + pages = {312--327}, + publisher = {Cambridge University Press}, + title = {Gendered citation patterns across political science and social science methodology fields}, + volume = {26}, + year = {2018} +} + +@article{Dworkin2020.01.03.894378, + abstract = {Like many scientific disciplines, neuroscience has increasingly attempted to confront pervasive gender imbalances within the field. While much of the conversation has centered around publishing and conference participation, recent research in other fields has called attention to the prevalence of gender bias in citation practices. 
Because of the downstream effects that citations can have on visibility and career advancement, understanding and eliminating gender bias in citation practices is vital for addressing inequity in a scientific community. In this study, we sought to determine whether there is evidence of gender bias in the citation practices of neuroscientists. Utilizing data from five top neuroscience journals, we indeed find that reference lists tend to include more papers with men as first and last author than would be expected if gender was not a factor in referencing. Importantly, we show that this overcitation of men and undercitation of women is driven largely by the citation practices of men, and is increasing with time despite greater diversity in the academy. We develop a co-authorship network to determine the degree to which homophily in researchers{\textquoteright} social networks explains gendered citation practices and we find that men tend to overcite other men even when their social networks are representative of the field. We discuss possible mechanisms and consider how individual researchers might incorporate these findings into their own referencing practices.}, + author = {Dworkin, Jordan D. and Linn, Kristin A. and Teich, Erin G. and Zurn, Perry and Shinohara, Russell T. 
and Bassett, Danielle S.}, + doi = {10.1101/2020.01.03.894378}, + elocation-id = {2020.01.03.894378}, + eprint = {https://www.biorxiv.org/content/early/2020/01/11/2020.01.03.894378.full.pdf}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + title = {The extent and drivers of gender imbalance in neuroscience reference lists}, + url = {https://www.biorxiv.org/content/early/2020/01/11/2020.01.03.894378}, + year = {2020} +} + +@article{ethnicolr2022asian, + author = {Wang, Andrew and Yeoh, Michelle}, + journal = {Nature neuroscience}, + number = {2}, + pages = {130}, + publisher = {Nature Publishing Group}, + title = {Test of ethnicolr}, + volume = {16}, + year = {2013} +} + +@article{ethnicolr2022black, + author = {Washington, Denzel and Brown-Jackson, Ketanji}, + journal = {Nature neuroscience}, + number = {2}, + pages = {130}, + publisher = {Nature Publishing Group}, + title = {Test of ethnicolr}, + volume = {16}, + year = {2013} +} + +@article{ethnicolr2022hispanic, + author = {Cruz, Rafael and Ocasio-Cortez, Alexandria}, + journal = {Nature neuroscience}, + number = {2}, + pages = {130}, + publisher = {Nature Publishing Group}, + title = {Test of ethnicolr}, + volume = {16}, + year = {2013} +} + +@article{ethnicolr2022white, + author = {Coppola, Nicolas and Streep, Meryl}, + journal = {Nature neuroscience}, + number = {2}, + pages = {130}, + publisher = {Nature Publishing Group}, + title = {Test of ethnicolr}, + volume = {16}, + year = {2013} +} + +@book{fake2022, + author = {Danielle S. 
Bassett and Dale Zhou and Jennifer Stiso}, + publisher = {MIT Press}, + title = {fake}, + year = {2022} +} + +@article{fulvio2021imbalance, + author = {Fulvio, Jacqueline M and Akinnola, Ileri and Postle, Bradley R}, + journal = {J Cogn Neurosci}, + number = {1}, + pages = {3-7}, + title = {Gender (Im)balance in Citation Practices in Cognitive Neuroscience}, + volume = {33}, + year = {2021} +} + +@article{jurafsky2018n, + author = {Jurafsky, D and Martin, JH}, + journal = {Speech and Language Processing: An Introduction to Natural Language Processing, Computational Linguistics, and Speech Recognition}, + title = {N-gram language models}, + year = {2018} +} + +@article{Lundine2019, + abstract = {Academic experts share their ideas, as well as contribute to advancing health science by participating in publishing as an author, reviewer and editor. The academy shapes and is shaped by knowledge produced within it. As such, the production of scientific knowledge can be described as part of a socially constructed system. Like all socially constructed systems, scientific knowledge production is influenced by gender. This study investigated one layer of this system through an analysis of journal editors' understanding of if and how gender influences editorial practices in peer reviewed health science journals. The study involved two stages: 1) exploratory in-depth qualitative interviews with editors at health science journals; and 2) a nominal group technique (NGT) with experts working on gender in research, academia and the journal peer review process. Our findings indicate that some editors had not considered the impact of gender on their editorial work. Many described how they actively strive to be ‘gender blind,' as this was seen as a means to be objective. This view fails to recognize how broader social structures operate to produce systemic inequities. 
None of the editors or publishers in this study were collecting gender or other social indicators as part of the article submission process. These findings suggest that there is room for editors and publishers to play a more active role in addressing structural inequities in academic publishing to ensure a diversity of knowledge and ideas are reflected.}, + author = {Lundine, J. and Bourgeault, Ivy Lynn and Glonti, Ketevan and Hutchinson, Eleanor and Balabanova, Dina}, + doi = {10.1016/j.socscimed.2019.112388}, + file = {:Users/stiso/Downloads/1-s2.0-S0277953619303739-main.pdf:pdf}, + issn = {18735347}, + journal = {Social Science and Medicine}, + keywords = {Feminist science studies,Gender inequity,Health sciences,Journalology,Peer review,Publishing}, + number = {January}, + pages = {112388}, + pmid = {31288167}, + publisher = {Elsevier}, + title = {{“I don't see gender”: Conceptualizing a gendered system of academic publishing}}, + url = {https://doi.org/10.1016/j.socscimed.2019.112388}, + volume = {235}, + year = {2019} +} + +@article{maliniak2013gender, + author = {Maliniak, Daniel and Powers, Ryan and Walter, Barbara F}, + journal = {International Organization}, + number = {4}, + pages = {889--922}, + publisher = {Cambridge University Press}, + title = {The gender citation gap in international relations}, + volume = {67}, + year = {2013} +} + +@article{mitchell2013gendered, + author = {Mitchell, Sara McLaughlin and Lange, Samantha and Brus, Holly}, + journal = {International Studies Perspectives}, + number = {4}, + pages = {485--492}, + publisher = {Blackwell Publishing Ltd Oxford, UK}, + title = {Gendered citation patterns in international relations journals}, + volume = {14}, + year = {2013} +} + +@book{moralia2005, + author = {Plutarch, Helmbold, William}, + publisher = {Harvard University Press}, + title = {Moralia, Volume VI}, + year = {1939} +} + +@article{sood2018predicting, + author = {Sood, Gaurav and Laohaprapanon, Suriyan}, + journal = {arXiv 
preprint arXiv:1805.02109}, + title = {Predicting race and ethnicity from the sequence of characters in a name}, + year = {2018} +} + +@article{wang2021gendered, + author = {Wang, Xinyi and Dworkin, Jordan D. and Zhou, Dale and Stiso, Jennifer and Falk, Emily B and Bassett, Danielle S. and Zurn, Perry and Lydon-Staley, David M.}, + doi = {10.1080/23808985.2021.1960180}, + journal = {Annals of the International Communication Association}, + title = {Gendered citation practices in the field of communication}, + year = {2021} +} + +@software{zhou_dale_2020_3672110, + author = {Zhou, Dale and +Cornblath, Eli J. and +Stiso, Jennifer and +Teich, Erin G. and +Dworkin, Jordan D. and +Blevins, Ann S. and +Bassett, Danielle S.}, + doi = {10.5281/zenodo.3672110}, + month = {February}, + publisher = {Zenodo}, + title = {Gender Diversity Statement and Code Notebook v1.0}, + url = {https://doi.org/10.5281/zenodo.3672110}, + version = {v1.0}, + year = {2020} +} + +@article{zurn2020network, + author = {Zurn, Perry and Bassett, Danielle S}, + journal = {Philosophical Transactions of the Royal Society B}, + number = {1796}, + pages = {20190323}, + publisher = {The Royal Society}, + title = {Network architectures supporting learnability}, + volume = {375}, + year = {2020} +} + diff --git a/tests/immaculate/cleanedBib.csv b/tests/immaculate/cleanedBib.csv new file mode 100644 index 0000000..7f322e7 --- /dev/null +++ b/tests/immaculate/cleanedBib.csv @@ -0,0 +1,15 @@ +Article,FA,LA,Title,SelfCite,CitationKey +2,Danielle,Perry,,N,bassett2022curious +3,Gyorgy,Edvard,,N,buzsaki2013memory +4,Paula,Rachel,,N,chatterjee2021gender +5,Andrew,Michelle,,N,ethnicolr2022asian +6,Denzel,Ketanji,,N,ethnicolr2022black +7,Rafael,Alexandria,,N,ethnicolr2022hispanic +8,Nicolas,Meryl,,N,ethnicolr2022white +9,Danielle,Jennifer,,Y,fake2022 +10,Jacqueline,Bradley,,N,fulvio2021imbalance +11,,JH,,N,jurafsky2018n +12,,Dina,,N,Lundine2019 +13,Sara,Holly,,N,mitchell2013gendered +14,William,William,,N,moralia2005 
+15,Perry,Danielle,,N,zurn2020network diff --git a/tests/immaculate/pipeline.py b/tests/immaculate/pipeline.py index 999665b..4bd4640 100644 --- a/tests/immaculate/pipeline.py +++ b/tests/immaculate/pipeline.py @@ -27,11 +27,11 @@ if paper_aux_file: find_unused_cites(paper_aux_file) -bib_data = get_bib_data(homedir) +bib_data = get_bib_data(bib_files[0]) if checkingPublishedArticle: get_names_published(homedir, bib_data, cr) else: # find and print duplicates - get_duplicates(bib_data) + bib_data = get_duplicates(bib_data, bib_files[0]) # get names, remove CDS, find self cites get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr) \ No newline at end of file diff --git a/utils/__pycache__/preprocessing.cpython-39.pyc b/utils/__pycache__/preprocessing.cpython-39.pyc index ba7ab28fc57decb39b0b40da6dd0f71ec7b9b663..b95c194989df590caedee869c0a591ccecbeda2f 100644 GIT binary patch delta 2660 zcmZuzU2Ggz6~1R?_Q&hp_1ccTYkO@^LhH>Yj^miWU{b3Rp@~&m8d{Qg)3B`fj_t{g zch`4j>)MQKnhlC0T55CEDx|cmN%;u^q-uqP0Of%Pq$2V2zyr(!LOirWyi^KL2w~2d zb(5H8cIUfi&i}ploICT=x%W%)LM#?xc*)<@XM?A=;%V@<4(Havisqu6@qiV3hT#ZS zJlDa4JamQS5|j}P!Pj{SU)58*h#hq;bp`t5(+R3P5GuKpZAZ=MPK0pNIZe`pKfbFvw3 zLSD|dy$1`;&qwYku+;oaECYit<1FkT%_R_pFo>s+@RYF72aBq|pc4KKKxNi>mJe+`vvf8E1jNU33nDB>LQ ziVDI9R~QMp$)o)-U~HaYHtZ5>z&1O{E?&RJDDz8d9U2;k4VJ_?B`IgsRk>Qy#AR1$ z1U8TZ%n3F^SJ@nJjR&r*hso@uKWQH9V{4kL;bMfornG2u-n$kcorbUJ2Ksy_OP3xT z&<_@O+IYAQmtiUHL|ml~JXq2;L5w)0H-qSOqFqN_S!3l0 z($Jz^hI^M-U5@B@Ifas;&ziG&F8`i5wca8NPC;KborOxx(Rr4Y4yStaKSRbHyxHo3dBe%y%VX?>RVfxMdkP8Dc8Zn7>Xe`?o?Jcn z#d2ZRGIXb++qGiRuT4NWO@E%jC?1Wify$WV^iRZdQ42 z79CH+<*~&OioDRSKsP8 zImYv?HuN-JTdev`mko=vr2h=*NA4}YAIwut{=9e`MY&MfSk9x5Tzkur-}U~!iX7SZ zl6`#d2L1qf_87uA3uzFC6rM~Gbx5HTYM+{hh!O|n?EvMWeI&$1SxTM)A)Cn=n31tmx?Mbr`~(&U%08W18d@bamw-yo z&(2rr@>QC@O-YCdgG9JI7^6P}vr_iI(C>v7DvP+^qO~lKk%~i5lbL}}!DU$(*c$d7 z`!nvQ8Pf@8=c~0@EYk&p|50|PUx%&cPtwN~NXmDIK8rXyeDsqkp4!IV?8Dy2a~L)i z@z>QZr_T+zoo<5n)wQ+QIaWTPF{gi1S@~0|znua3&Ef3R|8J{`srzjE(Kde9R$ajZ 
zlYY(s2RdhF*rvLKSXRb~nxZEsE#vyB(4ZTugEJ_f)!%u@qIuGq`z0wBiK-A(36==# z=JfvCDm*8{4yfB`C<%hgT-POk@YaRe}wI zYY1M%HmtdP(ZsVz=*Y-+i?5ULtkg%wCSM|w-gn{~1it?ZNa3p&8&cS&vqFWMcTC4Z z2Pa5rcLz5{J`M-v?U4*9^4*bZ@J6$mxv0W<`TIkMM_(nyVT4m{SW)QEnhnNC2ZP%9_>$3+VA)ciG&9;f<6kw znoN)Xy61HwTiZsIzv&m`o$0R2KX% znCzg`Cq{yvHg8pC<>M2BLBD=ql;w%xp?Na9j*tuCsZQ^RT%e3c<%%dGQ{I@!_RyD# hC=qNCyg_h7{&`{q#^ugLuk8G2Z>Tez4WAF^{|7(sjL851 delta 2003 zcmZvdO>7fK6oB{bu76_UbrS5vc5H_vEzSl=C`~9tkP9i4N=-qb5DXzCo=xKX*%>cC zah-*IYF%MgX z(#I;97iAUGm=9$&t76qCYgi3ijdC^fvj9pzYh-mScp4G`7Ghzv)UtXOL0QM5EQT`3 z8rT|?AqIVt-t-q0(yN2zEBQglP+9t1xeX924LN=S*j#$aRRnlleCsa3fH+a{CybN^ zJr86^m-@VMm_LWnStsRiRDv7$9@Ka}A}N#caZ^#!vjd-O55kldrM$$TPFjLFIS)*p zka?>KOkpZ>oR=46Q^65sy4F+|9ZS+2|kBBTx01e zgQd9TEayweyL-d~Uwr-&`eiE-l1sz2`eiq0GT)E=B+d4u(PS=s?8P!vMyY&hTv~)B zsYe=Uo0CnM`UO~2806(TiQ!XT7E>lvuHj4~8f-aNe0NX0Yfbz=i1QY#!`_ozxA?7Q`(PJe zi&G31!K!m z0rCxq8GkT(1Jx{Lkz6Wh5X3#Jlq&)CM|_}k$G=xqEI1*8q2J?{+_uqzBriu(iduwK zv2E0tHcn=E_dV=BkVGiF3k)$7J_zID>u{vb@@$(-WzA%ID%1Thg-s!HaS;p0grmMU zmL}T_!Ol!cIY&GsWvVn&FF>=L+?YxqH!L-sF{XtUjYGfK9DS#j_mLN^mG2~s62`=j z(XEgZ?pP6K#6m0zUy0kXYjCEtuVIS}72k6euz z%scYc`6?DFsB_)}(sYw5nV}2v?5`lv&kT#LP5O#hg?B#_8^&1eX{<8a@ig{oQ$5s+ zFPnC`B6)8C%!p8$zklHfRX#|dPm>=d3=@(BdPn(4>3;J!3Z%rX_?E6|wCGW8(!epo zal#40Ny024M|hi%C-f5zipG}NDj%uygj0lf#jTb)XL&Ql-IlgMfvl$q?-T4qhlNjX za6aDR4!sjJF|Nk}#6|rqTq;Fc2NXCUavRpI|A1l|5MRiAob(Loz;~0r{Fvv&&l?t6 z4$y$T@XMrqOt`wzxq!|=acW~9EQm*Kfz~VJLi5-Wy3EH&f6gptM0b1k#RVFDIsw-6 z8X1pLszI~%+oqAlT?IcCsJC;B_}z<#vw0m2yq-R-<6E_bfoxE@u!sqnU1M%i8C W4!ZsB)$R)XMcfshDzx;wkNgkYHNSNL diff --git a/utils/preprocessing.py b/utils/preprocessing.py index bd2ba0a..b995478 100644 --- a/utils/preprocessing.py +++ b/utils/preprocessing.py @@ -6,8 +6,10 @@ import os import csv from bibtexparser.bparser import BibTexParser +import bibtexparser import string from queries import * +import numpy as np def checkcites_output(aux_file): '''take in aux file for tex document, return list of citation keys @@ -145,38 +147,62 @@ def 
find_unused_cites(paper_aux_file): unused_in_paper = checkcites_output(paper_aux_file) # get citations in library not used in paper print("Unused citations: ", unused_in_paper.count('=>')) -def get_bib_data(homedir, parser=""): +def get_bib_data(filename, parser="bparser"): """ :param homedir: home directory parser: a string telling which parser to use (default is not to use bparser) :return: bib_data """ - ID = glob.glob(homedir + '*bib') + if parser == 'bparser': - bib_data = BibTexParser(common_strings=True, ignore_nonstandard_types=False).parse_file(bibtex_file) + bib_data = BibTexParser(common_strings=True, ignore_nonstandard_types=False).parse_file(open(filename)) else: + # this one will error if you have duplicates parser = bibtex.Parser() - bib_data = parser.parse_file(ID[0]) + bib_data = parser.parse_file(filename) return bib_data -def get_duplicates(bib_data): +def get_duplicates(bib_data, filename): """ take bib_data, and get duplicates :param homedir: home directory - :return: + :return: bib_data without duplicates """ duplicates = [] - for key in bib_data.entries.keys(): + for key in bib_data.entries_dict.keys(): count = str(bib_data.entries).count("'ID\': \'" + key + "\'") if count > 1: duplicates.append(key) + # remove from data + idx = np.where([x['ID'] == key for x in bib_data.entries])[0] + # remove first entry, so we keep that one + idx = idx[1:] + for i in idx: + bib_data.entries.remove(bib_data.entries[i]) + + # check that we got the duplicate + if (str(bib_data.entries).count("'ID\': \'" + key + "\'")) > 1: + raise ValueError("Unable to successfully remove duplicates") + if len(duplicates) > 0: - raise ValueError("In your .bib file, we found and removed duplicate entries for:", - ' '.join(map(str, duplicates))) + print("In your .bib file, we found and removed duplicate entries for the following entries:\n " + + ' '.join(map(str, duplicates)) + + "\n If this is incorrect, please edit you .bib file to give unique identifiers for all unique 
references:") + + # write new data to file + new_bib = filename[:-4] + '_clean.bib' + with open(new_bib, 'w') as bibtex_file: + bibtexparser.dump(bib_data, bibtex_file) + + # reparse + bib_data = get_bib_data(new_bib, "") + else: + bib_data = get_bib_data(filename, "") + return bib_data def get_names_published(homedir, bib_data, cr): @@ -305,6 +331,7 @@ def get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualC author = bib_data.entries[key].persons['author'] except: author = bib_data.entries[key].persons['editor'] + FA = author[0].rich_first_names LA = author[-1].rich_first_names FA = convertLatexSpecialChars(str(FA)[7:-3]).translate(str.maketrans('', '', string.punctuation)).replace( @@ -328,12 +355,12 @@ def get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualC # check that we got a name (not an initial) from the bib file, if not try using the title in the crossref API try: - title = bib_data.entries[key].fields['title'].replace(',', '').\ + title = bib_data.entries_dict[key].fields['title'].replace(',', '').\ replace(',', '').replace('{', '').replace('}','') except: title = '' try: - doi = bib_data.entries[key].fields['doi'] + doi = bib_data.entries_dict[key].fields['doi'] except: doi = '' if FA == '' or len(FA.split('.')[0]) <= 1: From 18a7e28f45986ee3425c79bfcce88542b558663d Mon Sep 17 00:00:00 2001 From: murphyka Date: Tue, 24 May 2022 14:50:03 -0400 Subject: [PATCH 12/47] Removed the R code and added code to check the number of credits allocated to the API key --- cleanBib.ipynb | 109 ++++++++++++++++++++++++++++--------------------- 1 file changed, 63 insertions(+), 46 deletions(-) diff --git a/cleanBib.ipynb b/cleanBib.ipynb index 4b9593d..01e4ded 100644 --- a/cleanBib.ipynb +++ b/cleanBib.ipynb @@ -187,50 +187,67 @@ }, "outputs": [], "source": [ - "genderAPI_key <- '&key=YOUR ACCOUNT KEY HERE'\n", - "\n", - "fileConn<-file(\"genderAPIkey.txt\")\n", - "writeLines(c(genderAPI_key), fileConn)\n", - 
"close(fileConn)\n", - "\n", - "names=read.csv(\"/home/jovyan/cleanedBib.csv\",stringsAsFactors=F)\n", - "setwd('/home/jovyan/')\n", - "\n", - "require(rjson)\n", - "gendFA=NULL;gendLA=NULL\n", - "gendFA_conf=NULL;gendLA_conf=NULL\n", - "\n", - "namesIncompleteFA=NULL\n", - "namesIncompleteLA=NULL\n", - "incompleteKeys=list()\n", - "incompleteRows=list()\n", - "\n", - "for(i in 1:nrow(names)){\n", - " if (nchar(names$FA[i])<2 || grepl(\"\\\\.\", names$FA[i])){\n", - " namesIncompleteFA[i] = i+1\n", - " incompleteKeys = c(incompleteKeys, names$CitationKey[i])\n", - " incompleteRows = c(incompleteRows, i+1)\n", - " }\n", - " namesIncompleteFA = namesIncompleteFA[!is.na(namesIncompleteFA)]\n", - " \n", - " if (nchar(names$LA[i])<2 || grepl(\"\\\\.\", names$LA[i])){\n", - " namesIncompleteLA[i] = i+1\n", - " incompleteKeys = c(incompleteKeys, names$CitationKey[i])\n", - " incompleteRows = c(incompleteRows, i+1)\n", - " }\n", - " namesIncompleteLA = namesIncompleteLA[!is.na(namesIncompleteLA)]\n", - "}\n", - "\n", - "if (length(names$CitationKey[which(names$SelfCite==\"Y\")]>0)){\n", - " print(paste(\"STOP: Please remove self-citations. Then, re-run steps 2 and 3. Here are some suggestions to check for with the following citation keys in your .bib file: \"))\n", - " print(paste(names$CitationKey[which(names$SelfCite==\"Y\")]))\n", - "}\n", - "\n", - "if (length(namesIncompleteFA)>0 || length(namesIncompleteLA)>0){\n", - " print(paste(\"STOP: Please revise incomplete full first names or empty cells. Then, re-run steps 2 and 3. Here are some suggestions to check for with the following citation keys in your .bib file: \"))\n", - " print(paste(incompleteKeys))\n", - " print(paste(\"Only continue if you've ran steps 2 and 3, and this code for step 3 no longer returns errors. For accuracy, please revise any incomplete names in the citations of your .bib file as indicated above. 
For more info, see rows\", paste(unique(c(namesIncompleteFA, namesIncompleteLA))), \"of cleanedBib.csv\"))\n", - "}" + "# Do a final check on the bibliography entries\n", + "with open(os.path.join(homedir, 'cleanedBib.csv')) as csvfile:\n", + " names_csv = csv.reader(csvfile)\n", + " names_db = []\n", + " for row in names_csv:\n", + " names_db.append(row)\n", + "\n", + "incomplete_name_bib_keys, self_cite_bib_keys = [[], []]\n", + "authors_full_list = []\n", + "for row in names_db[1:]: # Skip the first row, it's just headers\n", + " # Check that the authors' names have at least 2 characters and no periods\n", + " row_id, first_author, last_author, _, self_cite, bib_key = row\n", + " authors_full_list.append(first_author) # For counting the number of query calls needed\n", + " authors_full_list.append(last_author)\n", + " if len(first_author)<2 or len(last_author)<2 or '.' in first_author+last_author:\n", + " incomplete_name_bib_keys.append(bib_key)\n", + " if self_cite == 'Y':\n", + " self_cite_bib_keys.append(bib_key)\n", + " \n", + "if len(self_cite_bib_keys) > 0:\n", + " warning_message = \"STOP: Please remove self-citations. Then, re-run steps 2 and 3. \"\n", + " warning_message += \"Here are some suggestions to check for with the following citation keys in your .bib file: \"\n", + " print(warning_message)\n", + " print(self_cite_bib_keys)\n", + "\n", + "\n", + "if len(incomplete_name_bib_keys) > 0:\n", + " warning_message = \"STOP: Please revise incomplete full first names or empty cells. Then, re-run steps 2 and 3. 
\"\n", + " warning_message += \"Here are some suggestions to check for with the following citation keys in your .bib file: \"\n", + " print(warning_message)\n", + " print(incomplete_name_bib_keys)\n", + "\n", + "final_warning_message = \"Only continue if you've ran steps 2 and 3,\"\n", + "final_warning_message += \" and this code for step 3 no longer returns errors.\"\n", + "print(final_warning_message)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "R" + }, + "outputs": [], + "source": [ + "genderAPI_key = '&key=YOUR ACCOUNT KEY HERE'\n", + "\n", + "# TODO: Remove in the PR that gets rid of argparse. \n", + "# The following saves the api key to a txt file just to be reloaded by the next cell\n", + "with open(\"genderAPIkey.txt\", 'w') as f:\n", + " f.write(genderAPI_key)\n", + "\n", + "# Check your credit balance\n", + "url = \"https://gender-api.com/get-stats?key=\" + genderAPI_key\n", + "response = urlopen(url)\n", + "decoded = response.read().decode('utf-8')\n", + "decoded_json = json.loads(decoded)\n", + "print('Remaining credits: %s'%decoded_json[\"remaining_requests\"])\n", + "print('This should use (at most) %d credits, '%len(np.unique(authors_full_list)) + \\\n", + " 'saving you approx %d'%(len(authors_full_list)-len(np.unique(authors_full_list))) + \\\n", + " ' credits if results are stored.')" ] }, { @@ -718,9 +735,9 @@ "displayed": true, "height": 0 }, - "version": "0.21.7" + "version": "0.20.1" } }, "nbformat": 4, "nbformat_minor": 1 -} \ No newline at end of file +} From 09388d02b65c75987e9dd76cd1a566ecde38889f Mon Sep 17 00:00:00 2001 From: Jennifer Stiso Date: Mon, 13 Jun 2022 13:51:22 -0400 Subject: [PATCH 13/47] adding bibcheck to pipeline, defining API functions --- cleanBib.ipynb | 70 ++---- tests/aux/pipeline.py | 3 +- tests/erroneous/pipeline.py | 3 +- tests/immaculate/cleanedBib.csv | 28 +-- tests/immaculate/pipeline.py | 3 +- .../__pycache__/preprocessing.cpython-39.pyc | Bin 13808 -> 14879 bytes 
utils/__pycache__/queries.cpython-39.pyc | Bin 838 -> 1893 bytes utils/preprocessing.py | 36 ++- utils/queries.py | 222 ++++++++++++++++++ 9 files changed, 295 insertions(+), 70 deletions(-) diff --git a/cleanBib.ipynb b/cleanBib.ipynb index 01e4ded..3decec8 100644 --- a/cleanBib.ipynb +++ b/cleanBib.ipynb @@ -148,10 +148,18 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": { - "kernel": "SoS" + "kernel": "R" }, + "outputs": [], + "source": [ + "bib_check(homedir)" + ] + }, + { + "cell_type": "markdown", "source": [ "## 3. Estimate gender and race of authors from cleaned bibliography\n", "\n", @@ -168,61 +176,19 @@ "\n", "![save button](img/saveBib.png)\n", "\n", - "Common issues include: \n", + "Common issues include:\n", "\n", - "* Bibliography entry did not include a last author because the author list was truncated by \"and Others\" or \"et al.\" \n", - "* Some older journals articles only provide first initial and not full first names, in which case you will need to go digging via Google to identify that person. \n", - "* In rare cases where the author cannot be identified even after searching by hand, replace the first name with \"UNKNOWNNAMES\" so that the classifier will estimate the gender as unknown. \n", + "* Bibliography entry did not include a last author because the author list was truncated by \"and Others\" or \"et al.\"\n", + "* Some older journals articles only provide first initial and not full first names, in which case you will need to go digging via Google to identify that person.\n", + "* In rare cases where the author cannot be identified even after searching by hand, replace the first name with \"UNKNOWNNAMES\" so that the classifier will estimate the gender as unknown.\n", "\n", "__NOTE__: your free account has 500 queries per month. This box contains the code that will use your limited API credits/queries if it runs without error. 
Re-running all code repeatedly will repeatedly use these credits.\n", "\n", "Then, run the code blocks below. (click to select the block and then press Ctrl+Enter; or click the block and press the Run button in the top menubar)" - ] - }, - { - "cell_type": "code", - "execution_count": null, + ], "metadata": { - "kernel": "R" - }, - "outputs": [], - "source": [ - "# Do a final check on the bibliography entries\n", - "with open(os.path.join(homedir, 'cleanedBib.csv')) as csvfile:\n", - " names_csv = csv.reader(csvfile)\n", - " names_db = []\n", - " for row in names_csv:\n", - " names_db.append(row)\n", - "\n", - "incomplete_name_bib_keys, self_cite_bib_keys = [[], []]\n", - "authors_full_list = []\n", - "for row in names_db[1:]: # Skip the first row, it's just headers\n", - " # Check that the authors' names have at least 2 characters and no periods\n", - " row_id, first_author, last_author, _, self_cite, bib_key = row\n", - " authors_full_list.append(first_author) # For counting the number of query calls needed\n", - " authors_full_list.append(last_author)\n", - " if len(first_author)<2 or len(last_author)<2 or '.' in first_author+last_author:\n", - " incomplete_name_bib_keys.append(bib_key)\n", - " if self_cite == 'Y':\n", - " self_cite_bib_keys.append(bib_key)\n", - " \n", - "if len(self_cite_bib_keys) > 0:\n", - " warning_message = \"STOP: Please remove self-citations. Then, re-run steps 2 and 3. \"\n", - " warning_message += \"Here are some suggestions to check for with the following citation keys in your .bib file: \"\n", - " print(warning_message)\n", - " print(self_cite_bib_keys)\n", - "\n", - "\n", - "if len(incomplete_name_bib_keys) > 0:\n", - " warning_message = \"STOP: Please revise incomplete full first names or empty cells. Then, re-run steps 2 and 3. 
\"\n", - " warning_message += \"Here are some suggestions to check for with the following citation keys in your .bib file: \"\n", - " print(warning_message)\n", - " print(incomplete_name_bib_keys)\n", - "\n", - "final_warning_message = \"Only continue if you've ran steps 2 and 3,\"\n", - "final_warning_message += \" and this code for step 3 no longer returns errors.\"\n", - "print(final_warning_message)" - ] + "collapsed": false + } }, { "cell_type": "code", @@ -740,4 +706,4 @@ }, "nbformat": 4, "nbformat_minor": 1 -} +} \ No newline at end of file diff --git a/tests/aux/pipeline.py b/tests/aux/pipeline.py index 4bd4640..c524166 100644 --- a/tests/aux/pipeline.py +++ b/tests/aux/pipeline.py @@ -34,4 +34,5 @@ # find and print duplicates bib_data = get_duplicates(bib_data, bib_files[0]) # get names, remove CDS, find self cites - get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr) \ No newline at end of file + get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr) +bib_check(homedir) \ No newline at end of file diff --git a/tests/erroneous/pipeline.py b/tests/erroneous/pipeline.py index 4bd4640..c524166 100644 --- a/tests/erroneous/pipeline.py +++ b/tests/erroneous/pipeline.py @@ -34,4 +34,5 @@ # find and print duplicates bib_data = get_duplicates(bib_data, bib_files[0]) # get names, remove CDS, find self cites - get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr) \ No newline at end of file + get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr) +bib_check(homedir) \ No newline at end of file diff --git a/tests/immaculate/cleanedBib.csv b/tests/immaculate/cleanedBib.csv index 7f322e7..9b5f43b 100644 --- a/tests/immaculate/cleanedBib.csv +++ b/tests/immaculate/cleanedBib.csv @@ -1,15 +1,15 @@ Article,FA,LA,Title,SelfCite,CitationKey -2,Danielle,Perry,,N,bassett2022curious -3,Gyorgy,Edvard,,N,buzsaki2013memory 
-4,Paula,Rachel,,N,chatterjee2021gender -5,Andrew,Michelle,,N,ethnicolr2022asian -6,Denzel,Ketanji,,N,ethnicolr2022black -7,Rafael,Alexandria,,N,ethnicolr2022hispanic -8,Nicolas,Meryl,,N,ethnicolr2022white -9,Danielle,Jennifer,,Y,fake2022 -10,Jacqueline,Bradley,,N,fulvio2021imbalance -11,,JH,,N,jurafsky2018n -12,,Dina,,N,Lundine2019 -13,Sara,Holly,,N,mitchell2013gendered -14,William,William,,N,moralia2005 -15,Perry,Danielle,,N,zurn2020network +2,Gyorgy,Edvard,,N,buzsaki2013memory +3,,Dina,,N,Lundine2019 +4,Perry,Danielle,,N,zurn2020network +5,William,William,,N,moralia2005 +6,Danielle,Perry,,N,bassett2022curious +7,Danielle,Jennifer,,Y,fake2022 +8,,JH,,N,jurafsky2018n +9,Sara,Holly,,N,mitchell2013gendered +10,Paula,Rachel,,N,chatterjee2021gender +11,Jacqueline,Bradley,,N,fulvio2021imbalance +12,Denzel,Ketanji,,N,ethnicolr2022black +13,Rafael,Alexandria,,N,ethnicolr2022hispanic +14,Andrew,Michelle,,N,ethnicolr2022asian +15,Nicolas,Meryl,,N,ethnicolr2022white diff --git a/tests/immaculate/pipeline.py b/tests/immaculate/pipeline.py index 4bd4640..c524166 100644 --- a/tests/immaculate/pipeline.py +++ b/tests/immaculate/pipeline.py @@ -34,4 +34,5 @@ # find and print duplicates bib_data = get_duplicates(bib_data, bib_files[0]) # get names, remove CDS, find self cites - get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr) \ No newline at end of file + get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr) +bib_check(homedir) \ No newline at end of file diff --git a/utils/__pycache__/preprocessing.cpython-39.pyc b/utils/__pycache__/preprocessing.cpython-39.pyc index b95c194989df590caedee869c0a591ccecbeda2f..b68df308c35f5ef863048c99f4094963f4a58c53 100644 GIT binary patch delta 1144 zcmZuvO=}ZD7@pZ|lHF~-EPjH@s6|cHpjAdox$ zsio3h{B!Bpdwgi=N40o>9(;n_eH2fl>evSde>~l633I16#bG_F9iZ9{b9+=l027ou_l+^4y=`4S70=95xf zqSiT-l#~^h2D*r)DQU4R6j)={sJ$f<7#L9}DaU0x0$IusT~2)n)Gkbk5JVM6HR$&d 
z7d5=?Yt&D$I5|CY?fN9S9x`7tB3LJyV??sBaiSh5Uj-4DH8Rs;{5Tw*5Is($VqHll zYGm>5WhNN$Valij&}%lCOhbr@NWI1CZPJJYnGaNps1}1w7)J8}Z}!PXcG6}Gk_4PA zL_I-jv%xHB1RxCh3g;a$59VQ7>Vx6v9AaxdoTXmf_ zJ$8+U3s67eD&RfH*wCeqz^#S<_j1P)`c3BUXaw93y-jYhL*FBXonQ@Z9iKk@5dJQC9(>5yjvZ`b z3y&KGSSv;aJEy=k2Gha?qm&+6*!s`Xbny;?t61F7%l_qWXTIZi%dgHJ#uLk*&z?4{ F=|96|Je&Xk delta 73 zcmbPV@*$fqk(ZZ?0SG3aZBAnIn8+u?cx$5e21dn+cQjcUgBdiHHVZNCH0IP~ED8jw c(v+S2#o`>J=ww4);mNlx`4|l*zp^w201o{XHvj+t diff --git a/utils/__pycache__/queries.cpython-39.pyc b/utils/__pycache__/queries.cpython-39.pyc index 8e5b128322dd86be564bcfd5e2eee6253afb9213..01db4841ccd562f77608c750a828f537a8c8f6b8 100644 GIT binary patch delta 1124 zcmbVK!EVz)5ZzrncH*X?M5qX;=n7FF0wo2(0VzUMEdnHHD_T@x2}RrNCUN4}>Doz1 zWh;mFiU9Eg^vIDf;KrHra)m1wet@x4phe=sS~ELu-i&v4XFeI<3I#SjZ6R16zwd9~ zG~X2F&Yl9q5y$(;Mx1aBcFgq+RLXp)ej(#JPQnjY9`-EAGDxrrOo-Z;;|R^BsV_;4 z9Tbz0ywf7w*4vo_yn>#8jsYb&(mLb}Y8$Z*5hmApW*EYM?KuCqA(XBK6M)LJjnDOc}3%?(0Vro@vyh^2O36Oh!=FEqdl;_;&t>I2X_|{ zPQPcMc>x~%#BSHhHGE0?UdQu?-k3Ip$A!E`8;~HVXumEEIv28~J^30^^B+ zkhDVU%)1;?+4pGJasqlKY_TxO6%evzVyng_qKs;(1+b-&xYa~1Fp5VSkr9H+OhC08z2UBAxU;0_qSLjS9n zWP&FrU-)e$waKIqxd7@~Vq5~H#`v+qPZi#BzM4?b$;nGidXiSPtCxEnSCQfNYH3=L z9=Ed+-l{!AGI=JrKklcoDA!=z**rMbdWy59qB^qhLf6-wv* zZqJ^1@bLbFdmH1;+WM2V&9&vVN2RV!Z diff --git a/utils/preprocessing.py b/utils/preprocessing.py index b995478..58b210b 100644 --- a/utils/preprocessing.py +++ b/utils/preprocessing.py @@ -471,6 +471,40 @@ def self_cites(author, yourFirstAuthor, yourLastAuthor, optionalEqualContributor return selfCite - +def bib_check(homedir): + # Do a final check on the bibliography entries + with open(os.path.join(homedir, 'cleanedBib.csv')) as csvfile: + names_csv = csv.reader(csvfile) + names_db = [] + for row in names_csv: + names_db.append(row) + + incomplete_name_bib_keys, self_cite_bib_keys = [[], []] + authors_full_list = [] + for row in names_db[1:]: # Skip the first row, it's just headers + # Check that the authors' names have at least 2 characters and no 
periods + row_id, first_author, last_author, _, self_cite, bib_key = row + authors_full_list.append(first_author) # For counting the number of query calls needed + authors_full_list.append(last_author) + if len(first_author) < 2 or len(last_author) < 2 or '.' in first_author + last_author: + incomplete_name_bib_keys.append(bib_key) + if self_cite == 'Y': + self_cite_bib_keys.append(bib_key) + + if len(self_cite_bib_keys) > 0: + warning_message = "STOP: Please remove self-citations. Then, re-run step 2. " + warning_message += "Here are some suggestions to check for with the following citation keys in your .bib file: " + print(warning_message) + print(self_cite_bib_keys) + + if len(incomplete_name_bib_keys) > 0: + warning_message = "STOP: Please revise incomplete full first names or empty cells. Then, re-run step 2. " + warning_message += "Here are some suggestions to check for with the following citation keys in your .bib file: " + print(warning_message) + print(incomplete_name_bib_keys) + + final_warning_message = "Only continue if you've run steps 2," + final_warning_message += " and this code no longer returns errors." 
+ print(final_warning_message) diff --git a/utils/queries.py b/utils/queries.py index 7cac1ac..6985371 100644 --- a/utils/queries.py +++ b/utils/queries.py @@ -29,4 +29,226 @@ def namesFromXref(cr, doi, title, authorPos): return name +def gender_base(homedir): + """ + for unknown gender, fill with base rates + you will never / can't run this (that file is too big to share) + """ + main_df = pd.read_csv('/%s/data/NewArticleData2019.csv'%(homedir),header=0) + + + gender_base = {} + for year in np.unique(main_df.PY.values): + ydf = main_df[main_df.PY==year].AG + fa = np.array([x[0] for x in ydf.values]) + la = np.array([x[1] for x in ydf.values]) + + fa_m = len(fa[fa=='M'])/ len(fa[fa!='U']) + fa_w = len(fa[fa=='W'])/ len(fa[fa!='U']) + + la_m = len(la[fa=='M'])/ len(la[la!='U']) + la_w = len(la[fa=='W'])/ len(la[la!='U']) + + gender_base[year] = [fa_m,fa_w,la_m,la_w] + + gender_base[2020] = [fa_m,fa_w,la_m,la_w] + + with open(homedir + '/data/gender_base' + '.pkl', 'wb') as f: + pickle.dump(gender_base, f, pickle.HIGHEST_PROTOCOL) + + +def get_pred_demos(authors): + """ + + :param authors: + :return: + """ + authors = authors.split(' ') + print('first author is %s %s ' % (authors[1], authors[0])) + print('last author is %s %s ' % (authors[3], authors[2])) + print("we don't count these, but check the predictions file to ensure your names did not slip through!") + + citation_matrix = np.zeros((8, 8)) + + print('looping through your references, predicting gender and race') + + columns = ['CitationKey', 'Author', 'Gender', 'W', 'A', 'GendCat'] + paper_df = pd.DataFrame(columns=columns) + + gender = [] + race = [] + + idx = 0 + for paper in tqdm.tqdm(bibfile.entries, total=len(bibfile.entries)): + if 'author' not in bibfile.entries[paper].persons.keys(): + continue # some editorials have no authors + if 'year' not in bibfile.entries[paper].fields.keys(): + year = 2020 + else: + year = int(bibfile.entries[paper].fields['year']) + + if year not in gender_base.keys(): + gb = 
gender_base[1995] + else: + gb = gender_base[year] + + fa = bibfile.entries[paper].persons['author'][0] + try: + fa_fname = fa.first_names[0] + except: + fa_fname = fa.last_names[0] # for people like Plato + fa_lname = fa.last_names[0] + + la = bibfile.entries[paper].persons['author'][-1] + try: + la_fname = la.first_names[0] + except: + la_fname = la.last_names[0] # for people like Plato + la_lname = la.last_names[0] + + if fa_fname.lower().strip() == authors[1].lower().strip(): + if fa_lname.lower().strip() == authors[0].lower().strip(): + continue + + if fa_fname.lower().strip() == authors[3].lower().strip(): + if fa_lname.lower().strip() == authors[2].lower().strip(): + continue + + if la_fname.lower().strip() == authors[1].lower().strip(): + if la_lname.lower().strip() == authors[0].lower().strip(): + continue + + if la_fname.lower().strip() == authors[3].lower().strip(): + if la_lname.lower().strip() == authors[2].lower().strip(): + continue + + fa_fname = convertLatexSpecialChars(str(fa_fname.encode("ascii", errors="ignore").decode())).translate( + str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '') + fa_lname = convertLatexSpecialChars(str(fa_lname.encode("ascii", errors="ignore").decode())).translate( + str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '') + la_fname = convertLatexSpecialChars(str(la_fname.encode("ascii", errors="ignore").decode())).translate( + str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '') + la_lname = convertLatexSpecialChars(str(la_lname.encode("ascii", errors="ignore").decode())).translate( + str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '') + + names = [{'lname': fa_lname, 'fname': fa_fname}] + fa_df = pd.DataFrame(names, columns=['fname', 'lname']) + asian, hispanic, black, white = pred_fl_reg_name(fa_df, 'lname', 
'fname').values[0][-4:] + fa_race = [white, asian, hispanic, black] + + names = [{'lname': la_lname, 'fname': la_fname}] + la_df = pd.DataFrame(names, columns=['fname', 'lname']) + asian, hispanic, black, white = pred_fl_reg_name(la_df, 'lname', 'fname').values[0][-4:] + la_race = [white, asian, hispanic, black] + + url = "https://gender-api.com/get?key=" + gender_key + "&name=%s" % (quote(fa_fname)) + response = urlopen(url) + decoded = response.read().decode('utf-8') + fa_gender = json.loads(decoded) + if fa_gender['gender'] == 'female': + fa_g = [0, fa_gender['accuracy'] / 100.] + if fa_gender['gender'] == 'male': + fa_g = [fa_gender['accuracy'] / 100., 0] + if fa_gender['gender'] == 'unknown': + fa_g = gb[:2] + + url = "https://gender-api.com/get?key=" + gender_key + "&name=%s" % (quote(la_fname)) + response = urlopen(url) + decoded = response.read().decode('utf-8') + la_gender = json.loads(decoded) + if la_gender['gender'] == 'female': + la_g = [0, la_gender['accuracy'] / 100.] + + if la_gender['gender'] == 'male': + la_g = [la_gender['accuracy'] / 100., 0] + + if la_gender['gender'] == 'unknown': + la_g = gb[2:] + + fa_data = np.array( + [paper, '%s,%s' % (fa_fname, fa_lname), '%s,%s' % (fa_gender['gender'], fa_gender['accuracy']), fa_race[0], + np.sum(fa_race[1:]), '']).reshape(1, 6) + paper_df = paper_df.append(pd.DataFrame(fa_data, columns=columns), ignore_index=True) + la_data = np.array( + [paper, '%s,%s' % (la_fname, la_lname), '%s,%s' % (la_gender['gender'], la_gender['accuracy']), la_race[0], + np.sum(la_race[1:]), '%s%s' % (fa_gender['gender'], la_gender['gender'])]).reshape(1, 6) + paper_df = paper_df.append(pd.DataFrame(la_data, columns=columns), ignore_index=True) + + mm = fa_g[0] * la_g[0] + wm = fa_g[1] * la_g[0] + mw = fa_g[0] * la_g[1] + ww = fa_g[1] * la_g[1] + mm, wm, mw, ww = [mm, wm, mw, ww] / np.sum([mm, wm, mw, ww]) + + gender.append([mm, wm, mw, ww]) + + ww = fa_race[0] * la_race[0] + aw = np.sum(fa_race[1:]) * la_race[0] + wa = 
fa_race[0] * np.sum(la_race[1:]) + aa = np.sum(fa_race[1:]) * np.sum(la_race[1:]) + + race.append([ww, aw, wa, aa]) + + paper_matrix = np.zeros((2, 8)) + paper_matrix[0] = np.outer(fa_g, fa_race).flatten() + paper_matrix[1] = np.outer(la_g, la_race).flatten() + + paper_matrix = np.outer(paper_matrix[0], paper_matrix[1]) + + citation_matrix = citation_matrix + paper_matrix + idx = idx + 1 + + mm, wm, mw, ww = np.mean(gender, axis=0) * 100 + WW, aw, wa, aa = np.mean(race, axis=0) * 100 + + return mm, wm, mw, ww, WW, aw, wa,aa + +def print_statements(mm, wm, mw, ww, WW, aw, wa,aa): + statement = "Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars \ + are under-cited relative to the number of such papers in the field (1-9). Here we sought to proactively consider choosing references that reflect the \ + diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. First, we obtained the predicted gender of the first \ + and last author of each reference by using databases that store the probability of a first name being carried by a woman (5, 10). By this measure \ + (and excluding self-citations to the first and last authors of our current paper), our references contain ww% woman(first)/woman(last), \ + MW% man/woman, WM% woman/man, and MM% man/man. This method is limited in that a) names, pronouns, and social media profiles used to construct the \ + databases may not, in every case, be indicative of gender identity and b) it cannot account for intersex, non-binary, or transgender people. \ + Second, we obtained predicted racial/ethnic category of the first and last author of each reference by databases that store the probability of a \ + first and last name being carried by an author of color (11, 12). 
By this measure (and excluding self-citations), our references contain AA% author of \ + color (first)/author of color(last), WA% white author/author of color, AW% author of color/white author, and WW% white author/white author. This method \ + is limited in that a) names and Florida Voter Data to make the predictions may not be indicative of racial/ethnic identity, and b) \ + it cannot account for Indigenous and mixed-race authors, or those who may face differential biases due to the ambiguous racialization or ethnicization of their names. \ + We look forward to future work that could help us to better understand how to support equitable practices in science." + + statement = statement.replace('MM', str(np.around(mm, 2))) + statement = statement.replace('WM', str(np.around(wm, 2))) + statement = statement.replace('MW', str(np.around(mw, 2))) + statement = statement.replace('ww', str(np.around(ww, 2))) + statement = statement.replace('WW', str(np.around(WW, 2))) + statement = statement.replace('AW', str(np.around(aw, 2))) + statement = statement.replace('WA', str(np.around(wa, 2))) + statement = statement.replace('AA', str(np.around(aa, 2))) + + statementLatex = "Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars \ + are under-cited relative to the number of such papers in the field \cite{mitchell2013gendered,dion2018gendered,caplar2017quantitative, maliniak2013gender, Dworkin2020.01.03.894378, bertolero2021racial, wang2021gendered, chatterjee2021gender, fulvio2021imbalance}. Here we sought to proactively consider choosing references that reflect the\ + diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. 
First, we obtained the predicted gender of the first \ + and last author of each reference by using databases that store the probability of a first name being carried by a woman \cite{Dworkin2020.01.03.894378,zhou_dale_2020_3672110}. By this measure \ + (and excluding self-citations to the first and last authors of our current paper), our references contain ww\% woman(first)/woman(last), \ + MW\% man/woman, WM\% woman/man, and MM\% man/man. This method is limited in that a) names, pronouns, and social media profiles used to construct the \ + databases may not, in every case, be indicative of gender identity and b) it cannot account for intersex, non-binary, or transgender people. \ + Second, we obtained predicted racial/ethnic category of the first and last author of each reference by databases that store the probability of a \ + first and last name being carried by an author of color \cite{ambekar2009name, sood2018predicting}. By this measure (and excluding self-citations), our references contain AA\% author of \ + color (first)/author of color(last), WA\% white author/author of color, AW\% author of color/white author, and WW\% white author/white author. This method \ + is limited in that a) names and Florida Voter Data to make the predictions may not be indicative of racial/ethnic identity, and b) \ + it cannot account for Indigenous and mixed-race authors, or those who may face differential biases due to the ambiguous racialization or ethnicization of their names. \ + We look forward to future work that could help us to better understand how to support equitable practices in science." 
+ + statementLatex = statementLatex.replace('MM', str(np.around(mm, 2))) + statementLatex = statementLatex.replace('WM', str(np.around(wm, 2))) + statementLatex = statementLatex.replace('MW', str(np.around(mw, 2))) + statementLatex = statementLatex.replace('ww', str(np.around(ww, 2))) + statementLatex = statementLatex.replace('WW', str(np.around(WW, 2))) + statementLatex = statementLatex.replace('AW', str(np.around(aw, 2))) + statementLatex = statementLatex.replace('WA', str(np.around(wa, 2))) + statementLatex = statementLatex.replace('AA', str(np.around(aa, 2))) + + return statement, statementLatex From 123d5c4833f51a82781497b924f6e02eb1301743 Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Fri, 15 Jul 2022 14:40:50 -0400 Subject: [PATCH 14/47] ipython req --- requirements.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index ff17b13..63ded73 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,9 +34,6 @@ habanero==1.2.0 idna==3.3 importlib-metadata==4.11.3 ipykernel==6.13.0 -ipython==8.3.0 -ipython-genutils==0.2.0 -ipywidgets==7.7.0 jedi==0.18.1 Jinja2==3.1.2 jsonschema==4.4.0 From 85ecff6d2d76ae74a552cbf25efe5e336749b59b Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Fri, 15 Jul 2022 14:45:28 -0400 Subject: [PATCH 15/47] pandas req --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 63ded73..4a59258 100644 --- a/requirements.txt +++ b/requirements.txt @@ -62,7 +62,7 @@ numpy==1.19.5 oauthlib==3.2.0 opt-einsum==3.3.0 packaging==21.3 -pandas==1.4.2 +pandas==1.3.5 pandocfilters==1.5.0 parso==0.8.3 pexpect==4.8.0 From c1d2b4f79a198cdc194754ee826cee12abbf6ec1 Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Fri, 15 Jul 2022 14:47:16 -0400 Subject: [PATCH 16/47] scipy req --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4a59258..57eb29f 100644 --- a/requirements.txt 
+++ b/requirements.txt @@ -91,7 +91,7 @@ QtPy==2.1.0 requests==2.27.1 requests-oauthlib==1.3.1 rsa==4.8 -scipy==1.8.0 +scipy==1.7.3 seaborn==0.11.2 Send2Trash==1.8.0 six==1.15.0 From 2c33711473c02c81c0fa1126c52b509978675b9d Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Fri, 15 Jul 2022 14:59:35 -0400 Subject: [PATCH 17/47] update versions --- requirements.txt | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index 57eb29f..380d35f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ cffi==1.15.0 charset-normalizer==2.0.12 cycler==0.11.0 debugpy==1.6.0 -decorator==5.1.1 +decorator==5.0 defusedxml==0.7.1 entrypoints==0.4 ethnicolr==0.8.1 @@ -23,19 +23,22 @@ executing==0.8.3 fastjsonschema==2.15.3 flatbuffers==1.12 fonttools==4.33.3 +folium==0.2.1 future==0.18.2 gast==0.4.0 -google-auth==2.6.6 +google-auth==2.0 google-auth-oauthlib==0.4.6 google-pasta==0.2.0 grpcio==1.34.1 h5py==3.1.0 habanero==1.2.0 idna==3.3 +imgaug==0.2.7 importlib-metadata==4.11.3 -ipykernel==6.13.0 +ipykernel==4.10 +ipython==5.5.0 jedi==0.18.1 -Jinja2==3.1.2 +Jinja2==3.0 jsonschema==4.4.0 jupyter==1.0.0 jupyter-client==7.3.0 @@ -57,8 +60,8 @@ nbclient==0.6.0 nbconvert==6.5.0 nbformat==5.3.0 nest-asyncio==1.5.5 -notebook==6.4.11 -numpy==1.19.5 +notebook==5.3.0 +numpy==1.21 oauthlib==3.2.0 opt-einsum==3.3.0 packaging==21.3 @@ -88,7 +91,7 @@ PyYAML==6.0 pyzmq==22.3.0 qtconsole==5.3.0 QtPy==2.1.0 -requests==2.27.1 +requests==2.23.0 requests-oauthlib==1.3.1 rsa==4.8 scipy==1.7.3 @@ -105,14 +108,14 @@ tensorflow-estimator==2.5.0 termcolor==1.1.0 terminado==0.13.3 tinycss2==1.1.1 -tornado==6.1 +tornado==5.1.0 tqdm==4.64.0 traitlets==5.1.1 typing-extensions==3.7.4.3 urllib3==1.26.9 wcwidth==0.2.5 webencodings==0.5.1 -Werkzeug==2.1.2 +Werkzeug==2.0 widgetsnbextension==3.6.0 wrapt==1.12.1 zipp==3.8.0 From 100cc9d12b2ff5fbfe4f38f0876abe299d16a9fa Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Fri, 15 Jul 2022 
15:00:49 -0400 Subject: [PATCH 18/47] update versions --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 380d35f..07b2e74 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ cffi==1.15.0 charset-normalizer==2.0.12 cycler==0.11.0 debugpy==1.6.0 -decorator==5.0 +decorator==4.4.2 defusedxml==0.7.1 entrypoints==0.4 ethnicolr==0.8.1 From f8a396491f24bdfca9a71d745259862b279f5db8 Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Fri, 15 Jul 2022 15:04:49 -0400 Subject: [PATCH 19/47] update versions --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 07b2e74..e51a72a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ backcall==0.2.0 beautifulsoup4==4.11.1 bibtexparser==1.2.0 bleach==5.0.0 -cachetools==5.0.0 +cachetools==4.2.4 certifi==2021.10.8 cffi==1.15.0 charset-normalizer==2.0.12 From 91bb1eac9a3e96234ecf75c11b74f6af6795f77d Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Fri, 15 Jul 2022 15:15:33 -0400 Subject: [PATCH 20/47] update versions --- requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index e51a72a..1b88742 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,19 +26,19 @@ fonttools==4.33.3 folium==0.2.1 future==0.18.2 gast==0.4.0 -google-auth==2.0 +google-auth==1.35.0 google-auth-oauthlib==0.4.6 google-pasta==0.2.0 grpcio==1.34.1 h5py==3.1.0 habanero==1.2.0 idna==3.3 -imgaug==0.2.7 +imgaug==0.2.6 importlib-metadata==4.11.3 ipykernel==4.10 ipython==5.5.0 jedi==0.18.1 -Jinja2==3.0 +Jinja2==2.11.3 jsonschema==4.4.0 jupyter==1.0.0 jupyter-client==7.3.0 @@ -115,7 +115,7 @@ typing-extensions==3.7.4.3 urllib3==1.26.9 wcwidth==0.2.5 webencodings==0.5.1 -Werkzeug==2.0 +Werkzeug==1.0.1 widgetsnbextension==3.6.0 wrapt==1.12.1 zipp==3.8.0 From 476a5d89771749cc695e3b17e6016cd13482e4de Mon Sep 17 00:00:00 
2001 From: Stiso Date: Fri, 22 Jul 2022 14:39:42 -0400 Subject: [PATCH 21/47] debugging pipeline for clean data --- .../immaculate/data/expected_matrix_florida.npy | Bin 0 -> 640 bytes tests/immaculate/data/expected_matrix_wiki.npy | Bin 0 -> 640 bytes .../data/expected_small_matrix_florida.npy | Bin 0 -> 160 bytes tests/immaculate/data/gender_base.pkl | Bin 0 -> 1621 bytes tests/immaculate/pipeline.py | 16 +++++++++++++++- 5 files changed, 15 insertions(+), 1 deletion(-) create mode 100755 tests/immaculate/data/expected_matrix_florida.npy create mode 100644 tests/immaculate/data/expected_matrix_wiki.npy create mode 100755 tests/immaculate/data/expected_small_matrix_florida.npy create mode 100644 tests/immaculate/data/gender_base.pkl diff --git a/tests/immaculate/data/expected_matrix_florida.npy b/tests/immaculate/data/expected_matrix_florida.npy new file mode 100755 index 0000000000000000000000000000000000000000..b52cbd858d721be41d5987d8b41d2071edbbf1d1 GIT binary patch literal 640 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= zXCxM+0{I#iItmt=ItsN4WCO0oTK)EP!E+8fkMSDLS~uO{lymtK-j9+2CMwYNLVs|Ah{h zH=WIH6RUKHIiKjiB)`+)sNM-ik$XK3^?vP}=jk*%Opl*x9`h-}!TD|NBDoJq4q`$- zO>R`qa_|fMHTf8SzQeHRRqpUL*W~-t>AnuzWUb!b zFAsN6Vf-zyc#ns}E9qFtx$^TI3MNW8_`fQ2m>6r3aJ?thVMfr+)8B5CIxM>Rj>~&p zm4n$jFH7llfevX)jGw%n<>j!&!eZ;iMF9>KPbPca-?Pu*pLgD4v8$~P!uRgI=lRy= zut4FxE^F3g2fy7MZzeG=bbv$DHJk;6Z2PmfCsl@3=ArmSggnD4On zyM^}qpl*kopN>7;u_M*tv+LWdKK*45=Q!W|zMRqMpca0ZsiU^gp`~!^-RT|P4xU%< zZCX1!%psm3=E>PP6CLjCpP~J%FV;bIe~a9e8MzKd=T}unrlvV$F|1nD_oCe4xyLK{ z)y*yrbsHq6aBhrrD9*oLcf8Qgp|w-_`xco#2el(|rKPfw4w`aJ3^Ch69Tvzf(6~H5 o#o_dxr~Q9^7dbcxed9m%#=)WEx9~eYbq9x43*X!;W%YId0KJ+J_y7O^ literal 0 HcmV?d00001 diff --git a/tests/immaculate/data/expected_matrix_wiki.npy b/tests/immaculate/data/expected_matrix_wiki.npy new file mode 100644 index 0000000000000000000000000000000000000000..cd228b65df77e62123c36bee1302543fdead02a3 GIT binary patch literal 640 
zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= zXCxM+0{I#iItmt=ItsN4WCN}VEB3QhKDywL=r?t`ThMHWRq+=qR^FNBuw7i+WgA!18i(6IFDYy{E_OJOAMJZtakB&GrNmYV?bQxW zxrNhDeyVUVTa%kNds3mp;ffQJx;OMWXqIKo5s7Pe@NntB8xxq|u;7TULdUI0ho11d zcCW)z9Ud(^@Y%6F#o?*e#lvM5=?*dHT7EDs2y#ePl&@D>T@=p388%*sy`wf{MKkXwr&h{khpCdGrcg! zA$9)$r!(W-9lrc7u}U{|aM(BL`6NqbD~CfzZ#SMvIN;FHc5<)$^bUtN;rqR+XSX|4 zpRQ(Ut*dg_cP!$Py2?_A9~BMj|D7#yxMc98Ue}?-A#jbtk1bU>4)#|nEIxJtTvsvW0or6p3iSOAL`y816y7n1r#XDFUJw2ps5$eDfvwm908GncR l^hHwpJ{38X-;})H``*prtIac)(;1Eqod1u>3Cy#1008li1`YrK literal 0 HcmV?d00001 diff --git a/tests/immaculate/data/expected_small_matrix_florida.npy b/tests/immaculate/data/expected_small_matrix_florida.npy new file mode 100755 index 0000000000000000000000000000000000000000..34fa5b99ce9d4b1a8dcdcb2cc178bbd88f8332ae GIT binary patch literal 160 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= zXCxM+0{I$7ItoUbItsN4WCO19TK)EP!E+94_HnOREwIHw`g!|a1>yY;d}T+^AL-cQ F006H#DmwrG literal 0 HcmV?d00001 diff --git a/tests/immaculate/data/gender_base.pkl b/tests/immaculate/data/gender_base.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1e0d03054ed32c6984c5edf3004469830e898b44 GIT binary patch literal 1621 zcma*ne^8TU902gaU=YQj)fCi%nLl;|7I_`V5PeV%UmEA|XV50dCMeu7gu$?(5md{B zAvb*)iAYoj%<$w_g#jT)NFpL`z!NwcG}4eAzj6okvG|TOV0w$B zNsU2T-Z+H@gD+HCzX=E0A%QUslhw}nad>%{4v;kUp7y!C5oiF{!dzDg^zehmx_DTt zQS@a_(O560w@K>avR8ge5A?~C_txES02=aRx)^0b1eI|m?G2b3NV=`+*cV?thQt$B zmvy$XfmZjdTu`P%)%mC^0-rZAH4=0x{CdKX)B7kioOd)L$QDZ>`*D@*>pOiE8v5Qg zTixP|=^#nV0-eHM6hNY(+HvwO?6Cc>4##&DB8Y6hw$ShO-wqFv)UTv$dAS;@c8nV= zH=uJnr~B*;FAmTvRRsg9-o{mjN&3Dr5(ZE&wh{(Vhv9OLws4xs9de$Mw^gC%sv7H73x$8?mWizl~uf+raH#iopdMrUiw2GZeJpO4GLnGiuo=yQKuqtD zG{N6+fCpkELLX1nc|gO={8oc*KTwOedUe|xOf4jRt5RYrC;?jWQ`DYk(7A2gTFVmu z36ZVuKJC>7U}`0(ZQymCS#Xy^W@(o(M#Bf1;1#m#Kp{|5IjbqLAJbn*I!eVVex3)q zGjsQ<3?HCjy=tB-Z1qU7DY!-UJEp&qG`Ie_#$7_8QI9I;nK>cQ)!W+jlW^0ROCDbR 
zs8opQI7u%tLmy5428p##3%x_v0~LInUpQR@5$|LE%_9;_CrH{=tB&!2%fX!6@P~A^ z8)&G;v*+9pG;DnJ;&g`=(@BEb1J>@<#pOX_ZCl)6j4RM{*XG+TFTuxo=Hoc5pTpEf vP@DbR59?I#K-J(f#zjv#(9H*Zqu%I*$x=@2s7dup-f4GRy0ZUAs*?W?kJg7J literal 0 HcmV?d00001 diff --git a/tests/immaculate/pipeline.py b/tests/immaculate/pipeline.py index c524166..83c1865 100644 --- a/tests/immaculate/pipeline.py +++ b/tests/immaculate/pipeline.py @@ -6,6 +6,8 @@ wd = Path(os.getcwd()) sys.path.insert(1, f'{wd.parent.parent.absolute()}/utils') from preprocessing import * +from ethnicolr import pred_fl_reg_name +import tensorflow as tf cr = Crossref() #homedir = '/home/jovyan/' @@ -35,4 +37,16 @@ bib_data = get_duplicates(bib_data, bib_files[0]) # get names, remove CDS, find self cites get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr) -bib_check(homedir) \ No newline at end of file +bib_check(homedir) + +# queries +try: + f = open("genderAPIkey.txt", "r") + genderAPI_key = f.readline().replace('\n', '') +except: + genderAPI_key = input("Enter genderAPI key:") +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + +mm, wm, mw, ww, WW, aw, wa, aa, citation_matrix = get_pred_demos((yourFirstAuthor+' '+yourLastAuthor).replace(',',''), homedir, bib_data, genderAPI_key) +statement, statementLatex = print_statements(mm, wm, mw, ww, WW, aw, wa, aa) +print(statement) \ No newline at end of file From de95a8e20ff1e3caf7c1912f61184a5677c1802b Mon Sep 17 00:00:00 2001 From: Stiso Date: Fri, 22 Jul 2022 15:31:11 -0400 Subject: [PATCH 22/47] removing duplicate queries --- cleanBib.ipynb | 253 +----------------- tests/aux/data/expected_matrix_florida.npy | Bin 0 -> 640 bytes tests/aux/data/expected_matrix_wiki.npy | Bin 0 -> 640 bytes .../data/expected_small_matrix_florida.npy | Bin 0 -> 160 bytes tests/aux/data/gender_base.pkl | Bin 0 -> 1621 bytes .../data/expected_matrix_florida.npy | Bin 0 -> 640 bytes tests/erroneous/data/expected_matrix_wiki.npy | Bin 0 -> 640 bytes 
.../data/expected_small_matrix_florida.npy | Bin 0 -> 160 bytes tests/erroneous/data/gender_base.pkl | Bin 0 -> 1621 bytes .../__pycache__/preprocessing.cpython-310.pyc | Bin 0 -> 14445 bytes utils/__pycache__/queries.cpython-310.pyc | Bin 0 -> 10186 bytes utils/queries.py | 151 ++++++----- 12 files changed, 92 insertions(+), 312 deletions(-) create mode 100755 tests/aux/data/expected_matrix_florida.npy create mode 100644 tests/aux/data/expected_matrix_wiki.npy create mode 100755 tests/aux/data/expected_small_matrix_florida.npy create mode 100644 tests/aux/data/gender_base.pkl create mode 100755 tests/erroneous/data/expected_matrix_florida.npy create mode 100644 tests/erroneous/data/expected_matrix_wiki.npy create mode 100755 tests/erroneous/data/expected_small_matrix_florida.npy create mode 100644 tests/erroneous/data/gender_base.pkl create mode 100644 utils/__pycache__/preprocessing.cpython-310.pyc create mode 100644 utils/__pycache__/queries.cpython-310.pyc diff --git a/cleanBib.ipynb b/cleanBib.ipynb index 3decec8..bcafb12 100644 --- a/cleanBib.ipynb +++ b/cleanBib.ipynb @@ -49,30 +49,16 @@ }, "outputs": [], "source": [ - "import numpy as np\n", - "import bibtexparser\n", - "from bibtexparser.bparser import BibTexParser\n", "import glob\n", - "import subprocess\n", - "import os\n", - "from pybtex.database.input import bibtex\n", - "import csv\n", - "from pylatexenc.latex2text import LatexNodes2Text \n", - "import unicodedata\n", - "import re\n", - "import pandas as pd\n", "from habanero import Crossref\n", - "import string\n", - "from time import sleep\n", - "import tqdm\n", - "import matplotlib.pylab as plt\n", - "import matplotlib.gridspec as gridspec\n", - "import json\n", - "import pickle\n", - "from urllib.request import urlopen\n", - "from urllib.parse import quote\n", - "from ethnicolr import census_ln, pred_census_ln,pred_wiki_name\n", - "from pybtex.database import parse_file\n", + "import sys\n", + "import os\n", + "from pathlib import Path\n", + "wd 
= Path(os.getcwd())\n", + "sys.path.insert(1, f'{wd.parent.parent.absolute()}/utils')\n", + "from preprocessing import *\n", + "from ethnicolr import pred_fl_reg_name\n", + "import tensorflow as tf\n", "import seaborn as sns\n", "\n", "cr = Crossref()\n", @@ -243,226 +229,9 @@ "\n", "import tensorflow as tf\n", "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'\n", - "import argparse\n", - "parser = argparse.ArgumentParser()\n", - "parser.add_argument('-bibfile',action='store',dest='bibfile',default=' '.join(bib_files))\n", - "parser.add_argument('-homedir',action='store',dest='homedir',default='/home/jovyan/')\n", - "parser.add_argument('-authors',action='store',dest='authors', default=(yourFirstAuthor+' '+yourLastAuthor).replace(',',''))\n", - "parser.add_argument('-method',action='store',dest='method',default='florida')\n", - "parser.add_argument('-font',action='store',dest='font',default='Palatino') # hey, we all have our favorite\n", - "parser.add_argument('-gender_key',action='store',dest='gender_key',default=genderAPI_key)\n", - "r = parser.parse_args()\n", - "locals().update(r.__dict__)\n", - "bibfile = bib_data\n", - "\n", - "\n", - "def gender_base():\n", - "\t\"\"\"\n", - "\tfor unknown gender, fill with base rates\n", - "\tyou will never / can't run this (that file is too big to share)\n", - "\t\"\"\"\n", - "\tmain_df = pd.read_csv('/%s/data/NewArticleData2019.csv'%(homedir),header=0)\n", - "\n", - "\n", - "\tgender_base = {}\n", - "\tfor year in np.unique(main_df.PY.values):\n", - "\t\tydf = main_df[main_df.PY==year].AG\n", - "\t\tfa = np.array([x[0] for x in ydf.values])\n", - "\t\tla = np.array([x[1] for x in ydf.values])\n", - "\n", - "\t\tfa_m = len(fa[fa=='M'])/ len(fa[fa!='U'])\n", - "\t\tfa_w = len(fa[fa=='W'])/ len(fa[fa!='U'])\n", - "\n", - "\t\tla_m = len(la[fa=='M'])/ len(la[la!='U'])\n", - "\t\tla_w = len(la[fa=='W'])/ len(la[la!='U'])\n", - "\n", - "\t\tgender_base[year] = [fa_m,fa_w,la_m,la_w]\n", - "\n", - "\tgender_base[2020] = 
[fa_m,fa_w,la_m,la_w]\n", - "\n", - "\twith open(homedir + '/data/gender_base' + '.pkl', 'wb') as f:\n", - "\t\tpickle.dump(gender_base, f, pickle.HIGHEST_PROTOCOL)\n", - "\n", - "\n", - "with open(homedir + 'data/gender_base' + '.pkl', 'rb') as f:\n", - "\tgender_base = pickle.load(f)\n", - "\n", - "authors = authors.split(' ')\n", - "print ('first author is %s %s '%(authors[1],authors[0]))\n", - "print ('last author is %s %s '%(authors[3],authors[2]))\n", - "print (\"we don't count these, but check the predictions file to ensure your names did not slip through!\")\n", - "\n", - "citation_matrix = np.zeros((8,8))\n", - "matrix_idxs = {'white_m':0,'api_m':1,'hispanic_m':2,'black_m':3,'white_f':4,'api_f':5,'hispanic_f':6,'black_f':7}\n", - "\n", - "asian = [0,1,2]\n", - "black = [3,4]\n", - "white = [5,6,7,8,9,11,12]\n", - "hispanic = [10]\n", - "print ('looping through your references, predicting gender and race')\n", - "\n", - "columns=['CitationKey','Author','Gender','W','A', 'GendCat']\n", - "paper_df = pd.DataFrame(columns=columns)\n", - "\n", - "gender = []\n", - "race = []\n", - "\n", - "\n", - "idx = 0\n", - "for paper in tqdm.tqdm(bibfile.entries,total=len(bibfile.entries)): \n", - "\tif 'author' not in bibfile.entries[paper].persons.keys():\n", - "\t\tcontinue #some editorials have no authors\n", - "\tif 'year' not in bibfile.entries[paper].fields.keys():\n", - "\t\tyear = 2020\n", - "\telse: year = int(bibfile.entries[paper].fields['year']) \n", - "\t\n", - "\tif year not in gender_base.keys():\n", - "\t\tgb = gender_base[1995]\n", - "\telse:\n", - "\t\tgb = gender_base[year]\n", - "\n", - "\tfa = bibfile.entries[paper].persons['author'][0]\n", - "\ttry:fa_fname = fa.first_names[0] \n", - "\texcept:fa_fname = fa.last_names[0] #for people like Plato\n", - "\tfa_lname = fa.last_names[0] \n", - "\n", - "\tla = bibfile.entries[paper].persons['author'][-1]\n", - "\ttry:la_fname = la.first_names[0] \n", - "\texcept:la_fname = la.last_names[0] #for people like 
Plato\n", - "\tla_lname = la.last_names[0]\n", - "\n", - "\tif fa_fname.lower().strip() == authors[1].lower().strip():\n", - "\t\tif fa_lname.lower().strip() == authors[0].lower().strip() :\n", - "\t\t\tcontinue\n", - "\n", - "\tif fa_fname.lower().strip() == authors[3].lower().strip() :\n", - "\t\tif fa_lname.lower().strip() == authors[2].lower().strip() :\n", - "\t\t\tcontinue\n", - "\n", - "\tif la_fname.lower().strip() == authors[1].lower().strip() :\n", - "\t\tif la_lname.lower().strip() == authors[0].lower().strip() :\n", - "\t\t\tcontinue\n", - "\t\n", - "\tif la_fname.lower().strip() == authors[3].lower().strip() :\n", - "\t\tif la_lname.lower().strip() == authors[2].lower().strip() :\n", - "\t\t\tcontinue\n", - "\n", - "\tfa_fname = convertLatexSpecialChars(str(fa_fname.encode(\"ascii\", errors=\"ignore\").decode())).translate(str.maketrans('', '', re.sub('\\-', '', string.punctuation))).replace('Protected',\"\").replace(\" \",'')\n", - "\tfa_lname = convertLatexSpecialChars(str(fa_lname.encode(\"ascii\", errors=\"ignore\").decode())).translate(str.maketrans('', '', re.sub('\\-', '', string.punctuation))).replace('Protected',\"\").replace(\" \",'')\n", - "\tla_fname = convertLatexSpecialChars(str(la_fname.encode(\"ascii\", errors=\"ignore\").decode())).translate(str.maketrans('', '', re.sub('\\-', '', string.punctuation))).replace('Protected',\"\").replace(\" \",'') \n", - "\tla_lname = convertLatexSpecialChars(str(la_lname.encode(\"ascii\", errors=\"ignore\").decode())).translate(str.maketrans('', '', re.sub('\\-', '', string.punctuation))).replace('Protected',\"\").replace(\" \",'')\n", - "\n", - "\tnames = [{'lname': fa_lname,'fname':fa_fname}]\n", - "\tfa_df = pd.DataFrame(names,columns=['fname','lname'])\n", - "\tasian,hispanic,black,white = pred_fl_reg_name(fa_df,'lname','fname').values[0][-4:]\n", - "\tfa_race = [white,asian,hispanic,black]\n", - "\t\n", - "\tnames = [{'lname': la_lname,'fname':la_fname}]\n", - "\tla_df = 
pd.DataFrame(names,columns=['fname','lname'])\n", - "\tasian,hispanic,black,white = pred_fl_reg_name(la_df,'lname','fname').values[0][-4:]\n", - "\tla_race = [white,asian,hispanic,black]\n", - "\n", - "\turl = \"https://gender-api.com/get?key=\" + gender_key + \"&name=%s\" %(quote(fa_fname))\n", - "\tresponse = urlopen(url)\n", - "\tdecoded = response.read().decode('utf-8')\n", - "\tfa_gender = json.loads(decoded)\n", - "\tif fa_gender['gender'] == 'female':\n", - "\t\tfa_g = [0,fa_gender['accuracy']/100.]\n", - "\tif fa_gender['gender'] == 'male':\n", - "\t\tfa_g = [fa_gender['accuracy']/100.,0]\n", - "\tif fa_gender['gender'] == 'unknown':\n", - "\t\tfa_g = gb[:2]\n", - "\n", - "\turl = \"https://gender-api.com/get?key=\" + gender_key + \"&name=%s\" %(quote(la_fname))\n", - "\tresponse = urlopen(url)\n", - "\tdecoded = response.read().decode('utf-8')\n", - "\tla_gender = json.loads(decoded)\n", - "\tif la_gender['gender'] == 'female':\n", - "\t\tla_g = [0,la_gender['accuracy']/100.]\n", - "\t\n", - "\tif la_gender['gender'] == 'male':\n", - "\t\tla_g = [la_gender['accuracy']/100.,0]\n", - "\n", - "\tif la_gender['gender'] == 'unknown':\n", - "\t\tla_g = gb[2:] \n", - "\t\n", - "\tfa_data = np.array([paper,'%s,%s'%(fa_fname,fa_lname),'%s,%s'%(fa_gender['gender'],fa_gender['accuracy']),fa_race[0],np.sum(fa_race[1:]), '']).reshape(1,6)\n", - "\tpaper_df = paper_df.append(pd.DataFrame(fa_data,columns=columns),ignore_index =True)\n", - "\tla_data = np.array([paper,'%s,%s'%(la_fname,la_lname),'%s,%s'%(la_gender['gender'],la_gender['accuracy']),la_race[0],np.sum(la_race[1:]), '%s%s' % (fa_gender['gender'], la_gender['gender'])]).reshape(1,6)\n", - "\tpaper_df = paper_df.append(pd.DataFrame(la_data,columns=columns),ignore_index =True)\n", - "\n", - "\tmm = fa_g[0]*la_g[0]\n", - "\twm = fa_g[1]*la_g[0]\n", - "\tmw = fa_g[0]*la_g[1]\n", - "\tww = fa_g[1]*la_g[1]\n", - "\tmm,wm,mw,ww = [mm,wm,mw,ww]/np.sum([mm,wm,mw,ww])\n", - "\t\n", - "\tgender.append([mm,wm,mw,ww])\n", - 
"\n", - "\tww = fa_race[0] * la_race[0]\n", - "\taw = np.sum(fa_race[1:]) * la_race[0]\n", - "\twa = fa_race[0] * np.sum(la_race[1:])\n", - "\taa = np.sum(fa_race[1:]) * np.sum(la_race[1:])\n", - "\n", - "\trace.append([ww,aw,wa,aa])\n", - "\n", - "\tpaper_matrix = np.zeros((2,8))\n", - "\tpaper_matrix[0] = np.outer(fa_g,fa_race).flatten() \n", - "\tpaper_matrix[1] = np.outer(la_g,la_race).flatten() \n", - "\n", - "\tpaper_matrix = np.outer(paper_matrix[0],paper_matrix[1]) \n", - "\n", - "\tcitation_matrix = citation_matrix + paper_matrix\n", - "\tidx = idx + 1\n", - "\n", - "mm,wm,mw,ww = np.mean(gender,axis=0)*100\n", - "WW,aw,wa,aa = np.mean(race,axis=0)*100\n", - "\n", - "statement = \"Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars \\\n", - "are under-cited relative to the number of such papers in the field (1-9). Here we sought to proactively consider choosing references that reflect the \\\n", - "diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. First, we obtained the predicted gender of the first \\\n", - "and last author of each reference by using databases that store the probability of a first name being carried by a woman (5, 10). By this measure \\\n", - "(and excluding self-citations to the first and last authors of our current paper), our references contain ww% woman(first)/woman(last), \\\n", - "MW% man/woman, WM% woman/man, and MM% man/man. This method is limited in that a) names, pronouns, and social media profiles used to construct the \\\n", - "databases may not, in every case, be indicative of gender identity and b) it cannot account for intersex, non-binary, or transgender people. 
\\\n", - "Second, we obtained predicted racial/ethnic category of the first and last author of each reference by databases that store the probability of a \\\n", - "first and last name being carried by an author of color (11, 12). By this measure (and excluding self-citations), our references contain AA% author of \\\n", - "color (first)/author of color(last), WA% white author/author of color, AW% author of color/white author, and WW% white author/white author. This method \\\n", - "is limited in that a) names and Florida Voter Data to make the predictions may not be indicative of racial/ethnic identity, and b) \\\n", - "it cannot account for Indigenous and mixed-race authors, or those who may face differential biases due to the ambiguous racialization or ethnicization of their names. \\\n", - "We look forward to future work that could help us to better understand how to support equitable practices in science.\"\n", - "\n", - "statement = statement.replace('MM',str(np.around(mm,2)))\n", - "statement = statement.replace('WM',str(np.around(wm,2)))\n", - "statement = statement.replace('MW',str(np.around(mw,2)))\n", - "statement = statement.replace('ww',str(np.around(ww,2)))\n", - "statement = statement.replace('WW',str(np.around(WW,2)))\n", - "statement = statement.replace('AW',str(np.around(aw,2)))\n", - "statement = statement.replace('WA',str(np.around(wa,2)))\n", - "statement = statement.replace('AA',str(np.around(aa,2)))\n", - "\n", - "statementLatex = \"Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars \\\n", - "are under-cited relative to the number of such papers in the field \\cite{mitchell2013gendered,dion2018gendered,caplar2017quantitative, maliniak2013gender, Dworkin2020.01.03.894378, bertolero2021racial, wang2021gendered, chatterjee2021gender, fulvio2021imbalance}. 
Here we sought to proactively consider choosing references that reflect the\\\n", - "diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. First, we obtained the predicted gender of the first \\\n", - "and last author of each reference by using databases that store the probability of a first name being carried by a woman \\cite{Dworkin2020.01.03.894378,zhou_dale_2020_3672110}. By this measure \\\n", - "(and excluding self-citations to the first and last authors of our current paper), our references contain ww\\% woman(first)/woman(last), \\\n", - "MW\\% man/woman, WM\\% woman/man, and MM\\% man/man. This method is limited in that a) names, pronouns, and social media profiles used to construct the \\\n", - "databases may not, in every case, be indicative of gender identity and b) it cannot account for intersex, non-binary, or transgender people. \\\n", - "Second, we obtained predicted racial/ethnic category of the first and last author of each reference by databases that store the probability of a \\\n", - "first and last name being carried by an author of color \\cite{ambekar2009name, sood2018predicting}. By this measure (and excluding self-citations), our references contain AA\\% author of \\\n", - "color (first)/author of color(last), WA\\% white author/author of color, AW\\% author of color/white author, and WW\\% white author/white author. This method \\\n", - "is limited in that a) names and Florida Voter Data to make the predictions may not be indicative of racial/ethnic identity, and b) \\\n", - "it cannot account for Indigenous and mixed-race authors, or those who may face differential biases due to the ambiguous racialization or ethnicization of their names. 
\\\n", - "We look forward to future work that could help us to better understand how to support equitable practices in science.\"\n", - "\n", - "statementLatex = statementLatex.replace('MM',str(np.around(mm,2)))\n", - "statementLatex = statementLatex.replace('WM',str(np.around(wm,2)))\n", - "statementLatex = statementLatex.replace('MW',str(np.around(mw,2)))\n", - "statementLatex = statementLatex.replace('ww',str(np.around(ww,2)))\n", - "statementLatex = statementLatex.replace('WW',str(np.around(WW,2)))\n", - "statementLatex = statementLatex.replace('AW',str(np.around(aw,2)))\n", - "statementLatex = statementLatex.replace('WA',str(np.around(wa,2)))\n", - "statementLatex = statementLatex.replace('AA',str(np.around(aa,2)))" + "\n", + "mm, wm, mw, ww, WW, aw, wa, aa, citation_matrix = get_pred_demos((yourFirstAuthor+' '+yourLastAuthor).replace(',',''), homedir, bib_data, gender_key)\n", + "statement, statementLatex = print_statements(mm, wm, mw, ww, WW, aw, wa, aa)" ] }, { diff --git a/tests/aux/data/expected_matrix_florida.npy b/tests/aux/data/expected_matrix_florida.npy new file mode 100755 index 0000000000000000000000000000000000000000..b52cbd858d721be41d5987d8b41d2071edbbf1d1 GIT binary patch literal 640 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= zXCxM+0{I#iItmt=ItsN4WCO0oTK)EP!E+8fkMSDLS~uO{lymtK-j9+2CMwYNLVs|Ah{h zH=WIH6RUKHIiKjiB)`+)sNM-ik$XK3^?vP}=jk*%Opl*x9`h-}!TD|NBDoJq4q`$- zO>R`qa_|fMHTf8SzQeHRRqpUL*W~-t>AnuzWUb!b zFAsN6Vf-zyc#ns}E9qFtx$^TI3MNW8_`fQ2m>6r3aJ?thVMfr+)8B5CIxM>Rj>~&p zm4n$jFH7llfevX)jGw%n<>j!&!eZ;iMF9>KPbPca-?Pu*pLgD4v8$~P!uRgI=lRy= zut4FxE^F3g2fy7MZzeG=bbv$DHJk;6Z2PmfCsl@3=ArmSggnD4On zyM^}qpl*kopN>7;u_M*tv+LWdKK*45=Q!W|zMRqMpca0ZsiU^gp`~!^-RT|P4xU%< zZCX1!%psm3=E>PP6CLjCpP~J%FV;bIe~a9e8MzKd=T}unrlvV$F|1nD_oCe4xyLK{ z)y*yrbsHq6aBhrrD9*oLcf8Qgp|w-_`xco#2el(|rKPfw4w`aJ3^Ch69Tvzf(6~H5 o#o_dxr~Q9^7dbcxed9m%#=)WEx9~eYbq9x43*X!;W%YId0KJ+J_y7O^ literal 0 HcmV?d00001 diff --git a/tests/aux/data/expected_matrix_wiki.npy 
b/tests/aux/data/expected_matrix_wiki.npy new file mode 100644 index 0000000000000000000000000000000000000000..cd228b65df77e62123c36bee1302543fdead02a3 GIT binary patch literal 640 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= zXCxM+0{I#iItmt=ItsN4WCN}VEB3QhKDywL=r?t`ThMHWRq+=qR^FNBuw7i+WgA!18i(6IFDYy{E_OJOAMJZtakB&GrNmYV?bQxW zxrNhDeyVUVTa%kNds3mp;ffQJx;OMWXqIKo5s7Pe@NntB8xxq|u;7TULdUI0ho11d zcCW)z9Ud(^@Y%6F#o?*e#lvM5=?*dHT7EDs2y#ePl&@D>T@=p388%*sy`wf{MKkXwr&h{khpCdGrcg! zA$9)$r!(W-9lrc7u}U{|aM(BL`6NqbD~CfzZ#SMvIN;FHc5<)$^bUtN;rqR+XSX|4 zpRQ(Ut*dg_cP!$Py2?_A9~BMj|D7#yxMc98Ue}?-A#jbtk1bU>4)#|nEIxJtTvsvW0or6p3iSOAL`y816y7n1r#XDFUJw2ps5$eDfvwm908GncR l^hHwpJ{38X-;})H``*prtIac)(;1Eqod1u>3Cy#1008li1`YrK literal 0 HcmV?d00001 diff --git a/tests/aux/data/expected_small_matrix_florida.npy b/tests/aux/data/expected_small_matrix_florida.npy new file mode 100755 index 0000000000000000000000000000000000000000..34fa5b99ce9d4b1a8dcdcb2cc178bbd88f8332ae GIT binary patch literal 160 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= zXCxM+0{I$7ItoUbItsN4WCO19TK)EP!E+94_HnOREwIHw`g!|a1>yY;d}T+^AL-cQ F006H#DmwrG literal 0 HcmV?d00001 diff --git a/tests/aux/data/gender_base.pkl b/tests/aux/data/gender_base.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1e0d03054ed32c6984c5edf3004469830e898b44 GIT binary patch literal 1621 zcma*ne^8TU902gaU=YQj)fCi%nLl;|7I_`V5PeV%UmEA|XV50dCMeu7gu$?(5md{B zAvb*)iAYoj%<$w_g#jT)NFpL`z!NwcG}4eAzj6okvG|TOV0w$B zNsU2T-Z+H@gD+HCzX=E0A%QUslhw}nad>%{4v;kUp7y!C5oiF{!dzDg^zehmx_DTt zQS@a_(O560w@K>avR8ge5A?~C_txES02=aRx)^0b1eI|m?G2b3NV=`+*cV?thQt$B zmvy$XfmZjdTu`P%)%mC^0-rZAH4=0x{CdKX)B7kioOd)L$QDZ>`*D@*>pOiE8v5Qg zTixP|=^#nV0-eHM6hNY(+HvwO?6Cc>4##&DB8Y6hw$ShO-wqFv)UTv$dAS;@c8nV= zH=uJnr~B*;FAmTvRRsg9-o{mjN&3Dr5(ZE&wh{(Vhv9OLws4xs9de$Mw^gC%sv7H73x$8?mWizl~uf+raH#iopdMrUiw2GZeJpO4GLnGiuo=yQKuqtD zG{N6+fCpkELLX1nc|gO={8oc*KTwOedUe|xOf4jRt5RYrC;?jWQ`DYk(7A2gTFVmu 
z36ZVuKJC>7U}`0(ZQymCS#Xy^W@(o(M#Bf1;1#m#Kp{|5IjbqLAJbn*I!eVVex3)q zGjsQ<3?HCjy=tB-Z1qU7DY!-UJEp&qG`Ie_#$7_8QI9I;nK>cQ)!W+jlW^0ROCDbR zs8opQI7u%tLmy5428p##3%x_v0~LInUpQR@5$|LE%_9;_CrH{=tB&!2%fX!6@P~A^ z8)&G;v*+9pG;DnJ;&g`=(@BEb1J>@<#pOX_ZCl)6j4RM{*XG+TFTuxo=Hoc5pTpEf vP@DbR59?I#K-J(f#zjv#(9H*Zqu%I*$x=@2s7dup-f4GRy0ZUAs*?W?kJg7J literal 0 HcmV?d00001 diff --git a/tests/erroneous/data/expected_matrix_florida.npy b/tests/erroneous/data/expected_matrix_florida.npy new file mode 100755 index 0000000000000000000000000000000000000000..b52cbd858d721be41d5987d8b41d2071edbbf1d1 GIT binary patch literal 640 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= zXCxM+0{I#iItmt=ItsN4WCO0oTK)EP!E+8fkMSDLS~uO{lymtK-j9+2CMwYNLVs|Ah{h zH=WIH6RUKHIiKjiB)`+)sNM-ik$XK3^?vP}=jk*%Opl*x9`h-}!TD|NBDoJq4q`$- zO>R`qa_|fMHTf8SzQeHRRqpUL*W~-t>AnuzWUb!b zFAsN6Vf-zyc#ns}E9qFtx$^TI3MNW8_`fQ2m>6r3aJ?thVMfr+)8B5CIxM>Rj>~&p zm4n$jFH7llfevX)jGw%n<>j!&!eZ;iMF9>KPbPca-?Pu*pLgD4v8$~P!uRgI=lRy= zut4FxE^F3g2fy7MZzeG=bbv$DHJk;6Z2PmfCsl@3=ArmSggnD4On zyM^}qpl*kopN>7;u_M*tv+LWdKK*45=Q!W|zMRqMpca0ZsiU^gp`~!^-RT|P4xU%< zZCX1!%psm3=E>PP6CLjCpP~J%FV;bIe~a9e8MzKd=T}unrlvV$F|1nD_oCe4xyLK{ z)y*yrbsHq6aBhrrD9*oLcf8Qgp|w-_`xco#2el(|rKPfw4w`aJ3^Ch69Tvzf(6~H5 o#o_dxr~Q9^7dbcxed9m%#=)WEx9~eYbq9x43*X!;W%YId0KJ+J_y7O^ literal 0 HcmV?d00001 diff --git a/tests/erroneous/data/expected_matrix_wiki.npy b/tests/erroneous/data/expected_matrix_wiki.npy new file mode 100644 index 0000000000000000000000000000000000000000..cd228b65df77e62123c36bee1302543fdead02a3 GIT binary patch literal 640 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= zXCxM+0{I#iItmt=ItsN4WCN}VEB3QhKDywL=r?t`ThMHWRq+=qR^FNBuw7i+WgA!18i(6IFDYy{E_OJOAMJZtakB&GrNmYV?bQxW zxrNhDeyVUVTa%kNds3mp;ffQJx;OMWXqIKo5s7Pe@NntB8xxq|u;7TULdUI0ho11d zcCW)z9Ud(^@Y%6F#o?*e#lvM5=?*dHT7EDs2y#ePl&@D>T@=p388%*sy`wf{MKkXwr&h{khpCdGrcg! 
zA$9)$r!(W-9lrc7u}U{|aM(BL`6NqbD~CfzZ#SMvIN;FHc5<)$^bUtN;rqR+XSX|4 zpRQ(Ut*dg_cP!$Py2?_A9~BMj|D7#yxMc98Ue}?-A#jbtk1bU>4)#|nEIxJtTvsvW0or6p3iSOAL`y816y7n1r#XDFUJw2ps5$eDfvwm908GncR l^hHwpJ{38X-;})H``*prtIac)(;1Eqod1u>3Cy#1008li1`YrK literal 0 HcmV?d00001 diff --git a/tests/erroneous/data/expected_small_matrix_florida.npy b/tests/erroneous/data/expected_small_matrix_florida.npy new file mode 100755 index 0000000000000000000000000000000000000000..34fa5b99ce9d4b1a8dcdcb2cc178bbd88f8332ae GIT binary patch literal 160 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= zXCxM+0{I$7ItoUbItsN4WCO19TK)EP!E+94_HnOREwIHw`g!|a1>yY;d}T+^AL-cQ F006H#DmwrG literal 0 HcmV?d00001 diff --git a/tests/erroneous/data/gender_base.pkl b/tests/erroneous/data/gender_base.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1e0d03054ed32c6984c5edf3004469830e898b44 GIT binary patch literal 1621 zcma*ne^8TU902gaU=YQj)fCi%nLl;|7I_`V5PeV%UmEA|XV50dCMeu7gu$?(5md{B zAvb*)iAYoj%<$w_g#jT)NFpL`z!NwcG}4eAzj6okvG|TOV0w$B zNsU2T-Z+H@gD+HCzX=E0A%QUslhw}nad>%{4v;kUp7y!C5oiF{!dzDg^zehmx_DTt zQS@a_(O560w@K>avR8ge5A?~C_txES02=aRx)^0b1eI|m?G2b3NV=`+*cV?thQt$B zmvy$XfmZjdTu`P%)%mC^0-rZAH4=0x{CdKX)B7kioOd)L$QDZ>`*D@*>pOiE8v5Qg zTixP|=^#nV0-eHM6hNY(+HvwO?6Cc>4##&DB8Y6hw$ShO-wqFv)UTv$dAS;@c8nV= zH=uJnr~B*;FAmTvRRsg9-o{mjN&3Dr5(ZE&wh{(Vhv9OLws4xs9de$Mw^gC%sv7H73x$8?mWizl~uf+raH#iopdMrUiw2GZeJpO4GLnGiuo=yQKuqtD zG{N6+fCpkELLX1nc|gO={8oc*KTwOedUe|xOf4jRt5RYrC;?jWQ`DYk(7A2gTFVmu z36ZVuKJC>7U}`0(ZQymCS#Xy^W@(o(M#Bf1;1#m#Kp{|5IjbqLAJbn*I!eVVex3)q zGjsQ<3?HCjy=tB-Z1qU7DY!-UJEp&qG`Ie_#$7_8QI9I;nK>cQ)!W+jlW^0ROCDbR zs8opQI7u%tLmy5428p##3%x_v0~LInUpQR@5$|LE%_9;_CrH{=tB&!2%fX!6@P~A^ z8)&G;v*+9pG;DnJ;&g`=(@BEb1J>@<#pOX_ZCl)6j4RM{*XG+TFTuxo=Hoc5pTpEf vP@DbR59?I#K-J(f#zjv#(9H*Zqu%I*$x=@2s7dup-f4GRy0ZUAs*?W?kJg7J literal 0 HcmV?d00001 diff --git a/utils/__pycache__/preprocessing.cpython-310.pyc b/utils/__pycache__/preprocessing.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..5d0d8187dded3ab9a5f8648d7a7b8c294c4c3c80 GIT binary patch literal 14445 zcmb_jTW}lKdEOfq3lIcAiWDhXR@RqTf+|s#;x-QBsF8KCl}HqA$rmAZP!MNHf&{SO z*@Y+~i`FrnO77IPo0%rYQg34ZQVQqHY$=I4@aDa-LPR@HT3xdb*g7^G1pzoR=wP@J`f5 zFGuvU#hfrj@-3y97nVrjJtop(1n+T?5u|qZxx(>|Uv{qfNHQ0y3rIsO^K5kiiSuRYITGLMp^8cs<6i=A6VE9;UK@e0tf)d2 zT7cQvqb%w8)`ann)=>j>Mc*<+;-*URt%NWU(|mJN>F9xWT6uk{WBAFy*i?nSYRIPo zgD8mPs)o3}W#Jn$^c8K7a#K~6wTDGI(61`;(*Z^k=oryRyY81)9lP4F%dKnnVzuVj zi>|aW5L>vFR^4g%hi&QjE!nVZRnNEGMY~e<%YM~u*sIQlm$vcqm&?9gmP}^`^AHhQ zX!&--^@(_wR?!6ITC3d7R+gR0DkwSL48A2z5KW@z3%BLpNwoaMBTt8F`i^?!wuZd9 z_VY)6M!!NTOna?`rgSTg=Y_g#H9}+l-253ys}h>RsbF2hgx9Q9{m`IMhkDIvgl4(f zbQ+?NC|a1T5=~ZUO2=!}{9?YA}7(xolhTq z37f+^>iJdAT{(I>o@?)@aIPM$)SPkyo9JlEuhzVyP3gp)tu~fsnj7I*fAUJ0t!B&5 zf~Gf(V6Tx;)2gXiY6gGfs`iGZ8mghmDP&e?Cc8g6yapcLh60fQn-Cjk+E;uvP_U&m zq0`pcRJX7lu?aQ2O?r2pxw2-*{6kSnYC}x6Ev_EX#GfQLa0)w&zO> z4O`G(#^$3mE<;<3ev7s8(yWawVI#LjCoduq?aNu4=d0AK7ACmN?IZ?u`-6M;?zOWA z4%_q6^_`0Eh=T`0rJXoXIBXvXbr3Bi!s&|JxavrsPxYmyQ>m6~rP%=*$HR2Ld%X5hup6C|k0Y@0cuyd( zk`q`e?VhG_c-P2UKi-yN8F#tP5I><^+v6wSz(=5LDK{177%JYHuLbI+3XY-k0zWRa zEgg%e3nStT@i%x!^sQ+fqmyVnit-6;h)<%d39b^=N^UEaLn#BLJ{zd7MfFUi5C=LN zD3=u901wiplpryMjRcNkwW~uLpjK@>cmvFKSJ+>6Y!6c!@07zf@s{YF?Gc zP-18;hwg@NBDcGiIAXMhIC#8&`yaN?1R^ zis{x}18XIZqrlKQQI}{tG4sqI;wziV7TD%Z+T&{b(D1Af%PQgVXNMRUe-t!HY&jfd zrd9G?E?7{D={Nz^wv9tRj0z<@9@Wg4{5{k^O7Agun$Ld!I;xM8EU=%z&~Y|}j(0*B z-19{H@u8kSpwC5p?(FF>QT8g;YG_uM8m@FCvD<>lgOQ&=JRE5?s&ti#vR@8UC{r)j zs%_kvPJ@yqU0MY#G^ml#piwSb*Oxo`Fc)m@f48SDS~WK%|@%f;7F=UnLR}4F_65!efwawA)IUI(830FGCJ&h zt{=jZg*wszulV}slOF0APr(QiYb{4^02Vqm+}*g?t94*96VS_da4**Z;L1x*d(?OR za_vZj#k??q&epwRVySu+uuYv|j)Gd{dbPIE)(+bRE7U47OsuIQ_|}LpG&mDfK)qf+(?I$n(brn*T|8%YL6``FO_YL=UhEOl}3tlccr z{cs|3>Fi~+^4If88J505hi!pPxQR1BK zvo!`7ElP1A79QTgm^?cFnS_NolJkg$$pvh6!0RDKLF~H666tCNOCY>EhT($=lEAeE z)=0>gge*E_K7_9>kRsL|r4|zHuMbVlvRijVRnFQJu@S>ob>#+63BgzVqW;X<?E4Wo}D&WB9-&SaGR&kQYaF5+!N!K^GlGl|*!b6F- 
zEUI;ymz8iVbA2FG!nza(?GbQyl!3gAFi6DG^3U(w9=CPLPYG*NeN$aa`sqM}uwbJCQFUAyNu+fF#QZk<=EoQ z6MsoG3GPfQ?MG?f^+!4LOwFy7Yu*V&y2Tkx&b1G|)F>}NWWdq#S{2d*EVgO@r4fW5 z7+w42a}AqVzunVo9JbdYB}EQ?VpeooG!5>N&^YibxAjGVLZ&lohpecZdikKIt;7BWSk-AP*9E|2(?BN zm*z6alH!8|E%^v#j!|%d0z&1XUN1N0ixkVCEx@~&VHl!SZ}QQ{_4zUtzf6VF{UzhJ zLcJ=k$@Fp^nZmn3fO*E6_0C)9nq z1pzyw8)}vXZS!tEkwJXly|iKIX9OoG;( zB!NvFYs*(xNKm$Js&cOnRz)!!+eBc|9U~I&=HB2F4_IZyFu_6{Rt1DE|1yk0SQz?-GF0#75VCus&FN3Z;!+>HpcW zbkd(7%3!>^P%=w&-%~eoJC)_N?vC^2PAb?XCRerhR9Wn;I**tyC@7%cvi!+lbW^2y z>`c%u2onD802duXi0{EDm)tvCQcUgv>%>@%C0;Lo%AXD(NQj4~z)<5W2m2EX`DL>a7jU^*B=)U;BZ-C&-}w%UWZv_UrJ)E8DENBTy^&3VXkP$zw;*}ghe3wPL>&|Bc81;%v8Lq z@{34heu}G#SQfQ$M?HK;4Q}aSN}x@MEsiXv)>@>5gl4grq$uu$TV^qNGLq6l?d-`= zJAbm6xJ)uy(dvo=#Sw^KtaAUnvoY7s6Y>?1cDnps5^!o8B>mk6qrWFlC!{ zSl#hux`9*C4SZFV&{Z%N5JP6crqr9{fy=mCsXf61iAV3D`!t!MS}v%PB%ny~!^#{I zorVTPgXuzw_ep=lkRlBgBvASdD7?UlFsKMt^!t5F=dtkR5~|)Um8G~SpN1q~Ly!mh ze4il2KBax&!-vwt6%Xv3+#s@jJro&yWYA0NH9C$)We4eZGzd+TsZ`2-=ExBiua97w zc=g-IUpe{W+_|~uX6-phUeT^$p?|&X*{~5@1so7P+(BwfyZjgkv*`vo(MF%K=OcJg zFE?5M3>#>rEFp%ll^6!_*D{caW^iSSV^S z(&Cdjh^#*q_ed-alX8GG4O|Z~*o>&(ya^5yHiAZh@*{e*f4aJf9b53axjWIbyJMTN zFE2Zd!{`QZQUizy0E!yHzNNki$6f$RT%9RAB=ea4P;}9NI0B|CitQjGt z(IN3a9DAv;K)~`m;a0XKzzkJ$VE!S=K1?~#yEqaEp7mf6;vyAj1i&6atgt&w#)v*b zSZ68QqrgWHCRy|bD@wEC;uHzu#Y~(y=aL>K34A|K6OhDkN$8WCs9DSn9#)hqjtwSo z)#3H|s??o|#)FD`qIv`3)Er2kqM7fc`W3>wO1-VH!}h8&H?P9;T($yfy*l$-cX$0R;+hVV2S1IAR%fAMz}u zpcF!2!ne~{s}O;X*Uq@~NRqN5d3VRU~aCXvD76dvz8;B*2gGsc%mQf5N+p9H>y<^o6) zG2ly5GWPMMfXmLWooJ_f=}?LS^<{vOG7&Z!$RRwKxdPN6a-cBU8AUA5v84G!8A5&@LNRcrF#PeAT{rcc>ed9IM_|winDO!MjB>5)N34~N&=%`{ei-*< zcPGcRavzaa4z#EaDUW{`Qei9^cP^I#bJavhBT?&#C?jU@pvfPP%Vk zES{s?JV$w~1e`7CPZ*v26@}NTOJRx-`cR26tPhYr_1}A*W@>9%Oasef#dusQdQwRd zdfpQxg0Ubk_5wYRf<_LQcrP$0&El35il)xurANCPRUXMUNd%(#EO#flpfe5jJK|UA{ z(h;)W0VDrl@T{zz5NUv>hud#|nDT{?0^eBM?lQgrW^8ZSZPlPqsCkg@p{pT%GT)3I zU#zE2Uy8oCiJn^KYy?WX7}d51QL0=8|L8+gS)A7~A>8oOFwDA&r+CeSV3a9YxZ@rSevP=LfZlu?G)K2EgaT44B*tH@Hh 
zNfn_2c4UK-$~Dx4i3=B_*M9YdRvGs*6^3rZxe8l)RIlHGg(|QDNC4DGx8c^I$*M(` z2B@@%h`k6cdQbI?yB$}X*PeRqG5ZvBp=68Va+lm!mT?WWU;fhPT7WI*1T-zbNFc$! zgyvh(e4lN>WO_1xz2#;kqPW`g!SX~;Y6E6?$S0d4gL z4>&=?2I&f>4M@#}X9@SeJBOr5@?3H|u)9fh2hI?}t+`7ZKtxVWc7LzItrt$@Y~O%S@FT06|Q`Prs5(lUK}xQi`XTP43a!e)QfP zYkz8oxsheihLVcbmVgmRJv3j;g3O9AM4pdS@?seu*3Rvg^v?W1$Rq(`D@q#jg&cReWI;_bov zk5uPa%9g~C+Rx2PsPHg8pk>epuI?=Z)#nN{X3v2xkIoVe8`2@VLZ^>*J9{g`M5+DFZD~I+KD8b)5yO(V1~Me@4|lhD^&Ll3cgH1 z59-P{>637ue2apwP{6?Umk=WZ9Bs&A5;%otG40oz7`ADg5jJz~n2<%UPb z>@Zb_vl)Mqrx*GBgd@#XqvA)WJ+j1+GO;+u1S3a_m?9axBV2uhVsBIMYY5=Y1!h^o zN;)3n;=(TZ>y-I21^vwm9ox{i9ufs_$W zu?A8y*A=8`u)Fl@jfynXpt=}=^}ogT`-5+5*EP_}43x@o3cyaU4|#am5S&W`&aD8h zWT8CEA}t@M`Q5a&^MMrr2nQp89-~WUXUx|`4wRHg9SQ)Eg))lhd|%`{<38+FpfR?p z$St_b8G(tfarguzQEFGLtQ#kTqsejceh2k896ePws|%WlsjwPyhOg{k&G^+8G` zmh-J08Hvcc+Cv`Er8;Ejj4H%FEa1ZM-y?m%*}kp(zP!H0d?@RIM{%gs*^@(`=ewV= zj*drir$9{*V?JziM<{AvjE{Qc>2r>ldAZh}jrAUps)1D*q$cnxkjMQ8Dl!-v_W%3Dm09_;mn`YD&OVuu;khf6P4>N8f#36{wuF-aKO4AJ-W*a+6?91^G5IP5 zWRtmNvR@c8Pxkf;bIc4$xe~L&uTfcYTeubJ=H!t7rp)?n9Lx7fvyoLau!l5Q z1sI8*`AK_!pKT5(=XTa;4USlDQDeH?hgZMX@+N(Ln*x%&xAo$i=!N_Pq~AA>W`f!=-a~+1l~v! 
zq-8$1udCf9=Lz6u`-8^hKTi$xl%uRk{4UWZ+jmwWw_M|+)0KaGKh^L3pOY!c1{U@3 zyA+G3z@zvM-TZSRc#AR-p}G2fic|1W=ipJQI6Mb`L&X2&qtcS!yPwkEb;tc+3-cw^ z@+J`QPBLjHk>BKi8n^(x3G%y2?F740M_nH7_dlXFHY`s-^?ijsNaZi1X8SZ$v-j&} zImJG7=+NBB3uo*L=Pq4>p4PtjqCI~R(Pv3}I(O>gh57SmE}ucE_O2ahgz8-Tc$fN5 zPYW&_kDy~bE;uZ=>V3r^pb+t=J_?fGpikZid>|O~Fv`hxAq%^C9V_OBzsuVb7K+0) zj&Cb)e0z=K3&j-i(n#kxT=^5*Du1H9jk+g)k2+3EgyHu4&l`@nG+{ip2g~0@>^iO9 zc|@#HU>uygac$h3NQ@`OjfsQ-Um>KZ;|6&Rk!nwS1B?;pb1yJchS8C6&ls8hZ^JU~ zWPqRyW8yBnR*3S@ks!uu;W>rJdlqeye-#^mp|raRjt^t6>Kj`Lcw53SL8jwCgV8Jr z&l7#i3=C4F2Gd5OV}e2w$VL+ismVa;De_@?&Fom6 zR43gT>13c*OM-HWD03{$3PPBhfjRV5#rq=u$4I&xhfxkVW=eq~uM#9s4~Ahf?6JRa zEik(&YfqyVp%(IU`Lal|nb~4|WRb!IDoBg+@lLWClFI#-NbOh-W=g1|(qx<#BcPE( zUEnUH!KbB=nTs?R2QjD7u_NI{exCx&u+sk2rOOxRXS+s>*kyxuYtK*uy9ES#X#J&g z1pmDQPFGITK0af&zwkVK7a$ZuA9BoeK&fNB6&!V zXvX-V=M@5@%}d+*b+rmh3ORMu;m+YhvW1JufN=xb6=^&5CJB=8276$iAHPWM(mne{ zb&*;=0NJ!VGS5DKxP6diOLiq;t471MYi?r+nwtnMdA39UH^G}Jq}Z4-Ao?AmnSB!h zIdznw>|w)2->IXJkK`#5i=rd!m0%huT8xZJVgdh$#$AVtS#PKeQ&?D{$tsTbkVZdk zEH(<2XjV#uI*KN!l&YebiR8$rKyid|PxKM0rDCc_teAvTNC_($ebVc*vJTdxJzRa_1+p zts67=f85Bw014bMFVJhhB)5^`WOIXL-)oTXXE<`4{GB5=>0dyB&{(Ru3sj6eWoF2i zZvnnPGt~zEUk^LsMN35sRGdJMd;T;tu}2^~@X1Bw%ip0wFfBXefK*JtT-4l<3zS3u z|A*0J1eA``=ff1(6cDFm-%0k)VJ8^2OR&txY>$UFFoC_xGPItlyP{QdPSD|G#~TZ1 iho@&CYADKdjfsO%<7{9CF!zB-uY8lVftxHAh!vo!{#jfFLNVTyd&W*)quK z>3-e)-s|qy-M{w$Z*sDv;q#CG88`m$9ZmapDvW+IDEt_Y|F=lE#&t)l>*~$w8TB^m zhI(h~S@q7BJ-BOmxfjvdSCIWVceUsJ!k>)u(ut=YFo$ zr+J^gS+@?Xz(@zoa9G-`Gd`;K1D8uz#Ci9&24L65U2Bk@E^bMYE>-}t` z^XzU$z8z^hTEuvc=i7`Icrns#otIu3{alpe6H6K|<6*moyqA_Isf;JTr1i7zUGybB zE#ku~5&JuhPyKy{W6a=pj?cWz^mEW5&$Io)35N*1`Ec%mf~KD(qHF+Ru7AG4#(vC<53P+y%t{NRs#VLtkUpO}`*O-lxL(-CH1 z2afQ~z%!qPLLQh8rRV!nw9J+#O)CsKcsaHo#MzcD{Xk+Wwb*bhKX~;uKCu>Oz%tGy zO<$?%4EOA+K`R7~an2WxXo7)bckLj~wQMI4GR}8}?^|tgI2m|><*Z5Jhfd(fSvwG2 z|1hr!lGPwR6!`^$p942q#&XLEWvgIsp`HwKf}(7Z{LJNVG%E#OfG{t+^P z7GVo7V9D`hUa((O`g))ru}DAG`)o_&hM^hSi;NEAdZKM={)K&A$Ij5TFYpkpnmybr 
znfTDshwiTH?Ym}MxLnAZiB)sVeLLuw4GU%@tw8vy@qrhDj!IYT32APaP0PI*m@;(D zpkw>yYS6I)>Ml%_0?#uWcAM(W@eg`G9~e8?D9_6;^?sqP^^15)IIkvx zT%^Cy`(<0}PvWTr`JH^kb_zj}(ib|Pe90bZUuOPntv^L4&`u$mTGDn(M-#_%V*Kf| z$9!r?EvM8DpYBh)rC{=;hNBncOh?nvpjAIA^K&@o&bdZZj&NS}&kfs`cm?MdM&6$} z%ttdj6_uae(%fQVW}rJ7K`E(y_V`q9*zl?P zU!3X<+df^3-tz;>Yh?c_S{SK?-3&@f_OHfzkIVk`sovwVe~sRA(QJ!FlY1J*`ESu= zQigRcx>d^arC&1K`%7ww;=7UZy{|tS{`tPwArYFaWVNxREcoo zKV|(({mX78xEy6rckV?dp+uLW%g_pK&}TmFW1&2+`k4JIX`lJD4=9ltT~U4JKhOep zWJXim1kX7-w~)KUuV4-?@vF&67A?|_M!PG>W%#wf0unmLj%w$y$F72M9ee9)RM@$u z=J~2R&lq3fH!#zMA^-VkA?0r>{^pSX@{s@X2>+X@9;!9N9J>6yy#OM@1x46+uVvukbE;*jjGYDwn387D#KTCyWB<& znG?1kVUwDD1`~gBTWVAOm1`2^#ap(>>j#FFKl4 ziofDlncN^TG>uTtu%6rgD^7)*av85U+mkjBI&*t*_E5laiCK@w#Ye!BAH$SHg6WLl z8Slny~U3Ml;iNl!Fw%XtV)3pVhRFq>}E<08&+q29$nTCNXtDdJ+++7+q@fGf~E zY`{x`df+<}($5YMJ z2DiHw)Oar6rjCUk1j%jQk{?phAcK{ch%fmz^-3#r95r*>1n%ooC)1h`vZ}vCWI~Hs zyAiXNRWG!xZDn98mslKgKzD|8_K;ufWQ>!QkQ{NAG2F0P*DXIaX#Ry};%e(<>;$O#$sBznodr z+1Jx}mf4$fmyAo`c!`at6;{#9YzBOn@tiJ|P&WvJJ=;3 z9u{tp>3z83A7;a#wRS(*3AvW&T8>bck^B+H)y9L*7eB5WRDp<9;#{lat(ucGDUxO) zHl(n){9I*sa6nNU!{UCmAe+?2q@+c)83euJ0(N^3rx-XUhYVK=#Cm(o9OYB=`4>F? zJd!!sUj-Ib#;PSx85Gr~Noz9r(A7a@wLV`W_`u>1>P+pI7mrRUPG%PwB=Oy^zwgWAtK5&yCSbDLsEe=M()R5Zdd;kkHSLA)%ig zLqb10hJ=203<>@07!vx~F(mY}V<<+2sF>QQ_1Cz|e;SVMcB9TZfFOjgAOrC( zg-<+yBx!aLE0K0^01+L}!R2St2Z0>~kPtUo17ir1f}$#Wh&Fl(Te)GkfzG5ENl67j z4+36FZF6;V?Y-){`FlV~Cg7S+zzRssgN7cdxF?(gz&F>&cqCw(=h3AMNdsJ!=#DZ4 z9~EQLjH9FUNnr>-k)}i|5r}kovP-pKj>`_95a6pejDyjwQZsQc0^op6ZDcN5&6hqNR z6@kXc)pBFjtZD+|34?nsd;y?w8iMZ;n8PG++oCGMM?NN=7SdBz7sznbs)mcvwGPN1 zsF4WThOkjA5_}6pKVA3}$u_{crY4Z=Ml2PHZB$JgE{NrlM*)beoD&Mh0+zyR#q%0? 
zyK4>GwInPL6$(VAta`%hIbvN+(-UA$m!CGl)S(-Sfx_qwU+r47&YwA3w@g>T5s1NXd zVEq!9>bLR6eoA-!pW`bTdP?w!qh$-yLejl=ptxIZK`EqpB}L3;ODXaSDUWwM83 z67bk|ZEN?Wcg=i6Yi6VW-8<`dHrMamTfhI_+xOnRPlu`uJOFJEb(@Ku;K;MwHkF66 z01n9al{-S5&~eR&&YrD$+T8}e^#MNW|F7eX@8QO4;YMOA-?*@c9WR6r?ucz#^zD1^ zynA1S=x+&P$Ra6T`vw$qinU^U3Zo2Q%s8h}w%_9z zg}(kD;+U$D#GQXe=!IfG^_+$8HC$dT$sMBYQsPiD1US;AqDM&&Nn8Yg!q0f%^2L>5 z_NNpmAB#-UPrKmVY!4gx|BC+p4J3xXSfOC)$)CZ-DFr6`Xo_zNYIsTdeMa>>0>WXp zcc65~k4AvxEMNXE)e~F!2_>IVvPH=kNaA7wDm_OgA@(xDYmVJm$5&GLy-$XkR8o=z z3>1jQQ@l=wlw6~PemF*I{dh>KAEOjL=z2VK#EMS^?p=!Cx66 TISR)iJbbxS*8ZY!zC8KgkCfP$ literal 0 HcmV?d00001 diff --git a/utils/queries.py b/utils/queries.py index 6985371..a64c4e8 100644 --- a/utils/queries.py +++ b/utils/queries.py @@ -1,3 +1,15 @@ +import numpy as np +import pandas as pd +import pickle +import tqdm as tqdm +import preprocessing +import re +import string +from ethnicolr import pred_fl_reg_name +from urllib.parse import quote +from urllib.request import urlopen +import json + def namesFromXref(cr, doi, title, authorPos): '''Use DOI and article titles to query Crossref for author list''' if authorPos == 'first': @@ -29,35 +41,19 @@ def namesFromXref(cr, doi, title, authorPos): return name -def gender_base(homedir): - """ - for unknown gender, fill with base rates - you will never / can't run this (that file is too big to share) - """ - main_df = pd.read_csv('/%s/data/NewArticleData2019.csv'%(homedir),header=0) - - - gender_base = {} - for year in np.unique(main_df.PY.values): - ydf = main_df[main_df.PY==year].AG - fa = np.array([x[0] for x in ydf.values]) - la = np.array([x[1] for x in ydf.values]) - - fa_m = len(fa[fa=='M'])/ len(fa[fa!='U']) - fa_w = len(fa[fa=='W'])/ len(fa[fa!='U']) - - la_m = len(la[fa=='M'])/ len(la[la!='U']) - la_w = len(la[fa=='W'])/ len(la[la!='U']) - - gender_base[year] = [fa_m,fa_w,la_m,la_w] +def get_gender_base(homedir): + """ + for unknown gender, fill with base rates + you will never / can't run this (that file is too big to 
share) + """ - gender_base[2020] = [fa_m,fa_w,la_m,la_w] + with open(homedir + 'data/gender_base' + '.pkl', 'rb') as f: + gender_base = pickle.load(f) - with open(homedir + '/data/gender_base' + '.pkl', 'wb') as f: - pickle.dump(gender_base, f, pickle.HIGHEST_PROTOCOL) + return gender_base -def get_pred_demos(authors): +def get_pred_demos(authors, homedir, bibfile, gender_key, font='Palatino', method='florida'): """ :param authors: @@ -79,6 +75,11 @@ def get_pred_demos(authors): race = [] idx = 0 + # save base gender rates + gender_base = get_gender_base(homedir) + # make a dictionary of names so we don't query the same thing twice + full_name_data = {} + first_name_data = {} for paper in tqdm.tqdm(bibfile.entries, total=len(bibfile.entries)): if 'author' not in bibfile.entries[paper].persons.keys(): continue # some editorials have no authors @@ -106,6 +107,17 @@ def get_pred_demos(authors): la_fname = la.last_names[0] # for people like Plato la_lname = la.last_names[0] + + fa_fname = preprocessing.convertLatexSpecialChars(str(fa_fname.encode("ascii", errors="ignore").decode())).translate( + str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '') + fa_lname = preprocessing.convertLatexSpecialChars(str(fa_lname.encode("ascii", errors="ignore").decode())).translate( + str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '') + la_fname = preprocessing.convertLatexSpecialChars(str(la_fname.encode("ascii", errors="ignore").decode())).translate( + str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '') + la_lname = preprocessing.convertLatexSpecialChars(str(la_lname.encode("ascii", errors="ignore").decode())).translate( + str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '') + + # double check for self cites again if fa_fname.lower().strip() == authors[1].lower().strip(): if 
fa_lname.lower().strip() == authors[0].lower().strip(): continue @@ -122,48 +134,35 @@ def get_pred_demos(authors): if la_lname.lower().strip() == authors[2].lower().strip(): continue - fa_fname = convertLatexSpecialChars(str(fa_fname.encode("ascii", errors="ignore").decode())).translate( - str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '') - fa_lname = convertLatexSpecialChars(str(fa_lname.encode("ascii", errors="ignore").decode())).translate( - str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '') - la_fname = convertLatexSpecialChars(str(la_fname.encode("ascii", errors="ignore").decode())).translate( - str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '') - la_lname = convertLatexSpecialChars(str(la_lname.encode("ascii", errors="ignore").decode())).translate( - str.maketrans('', '', re.sub('\-', '', string.punctuation))).replace('Protected', "").replace(" ", '') + if (fa_lname, fa_fname) in full_name_data: + fa_race = full_name_data[(fa_lname, fa_fname)] + else: + names = [{'lname': fa_lname, 'fname': fa_fname}] + fa_df = pd.DataFrame(names, columns=['fname', 'lname']) + odf = pred_fl_reg_name(fa_df, 'lname', 'fname') + fa_race = [odf['nh_white'], odf['asian'], odf['hispanic'], odf['nh_black']] + full_name_data[(fa_lname, fa_fname)] = fa_race + + if (la_lname, la_fname) in full_name_data: + la_race = full_name_data[(la_lname, la_fname)] + else: + names = [{'lname': la_lname, 'fname': la_fname}] + la_df = pd.DataFrame(names, columns=['fname', 'lname']) + odf = pred_fl_reg_name(la_df, 'lname', 'fname') + la_race = [odf['nh_white'], odf['asian'], odf['hispanic'], odf['nh_black']] + full_name_data[(la_lname, la_fname)] = la_race + + if fa_fname in first_name_data: + fa_gender, fa_g = first_name_data[fa_fname] + else: + fa_gender, fa_g = gen_api_query(gender_key, fa_fname, gb) + first_name_data[fa_fname] = (fa_gender, 
fa_g) - names = [{'lname': fa_lname, 'fname': fa_fname}] - fa_df = pd.DataFrame(names, columns=['fname', 'lname']) - asian, hispanic, black, white = pred_fl_reg_name(fa_df, 'lname', 'fname').values[0][-4:] - fa_race = [white, asian, hispanic, black] - - names = [{'lname': la_lname, 'fname': la_fname}] - la_df = pd.DataFrame(names, columns=['fname', 'lname']) - asian, hispanic, black, white = pred_fl_reg_name(la_df, 'lname', 'fname').values[0][-4:] - la_race = [white, asian, hispanic, black] - - url = "https://gender-api.com/get?key=" + gender_key + "&name=%s" % (quote(fa_fname)) - response = urlopen(url) - decoded = response.read().decode('utf-8') - fa_gender = json.loads(decoded) - if fa_gender['gender'] == 'female': - fa_g = [0, fa_gender['accuracy'] / 100.] - if fa_gender['gender'] == 'male': - fa_g = [fa_gender['accuracy'] / 100., 0] - if fa_gender['gender'] == 'unknown': - fa_g = gb[:2] - - url = "https://gender-api.com/get?key=" + gender_key + "&name=%s" % (quote(la_fname)) - response = urlopen(url) - decoded = response.read().decode('utf-8') - la_gender = json.loads(decoded) - if la_gender['gender'] == 'female': - la_g = [0, la_gender['accuracy'] / 100.] 
- - if la_gender['gender'] == 'male': - la_g = [la_gender['accuracy'] / 100., 0] - - if la_gender['gender'] == 'unknown': - la_g = gb[2:] + if la_fname in first_name_data: + la_gender, la_g = first_name_data[la_fname] + else: + la_gender, la_g = gen_api_query(gender_key, la_fname, gb) + first_name_data[la_fname] = (la_gender, la_g) fa_data = np.array( [paper, '%s,%s' % (fa_fname, fa_lname), '%s,%s' % (fa_gender['gender'], fa_gender['accuracy']), fa_race[0], @@ -181,7 +180,6 @@ def get_pred_demos(authors): mm, wm, mw, ww = [mm, wm, mw, ww] / np.sum([mm, wm, mw, ww]) gender.append([mm, wm, mw, ww]) - ww = fa_race[0] * la_race[0] aw = np.sum(fa_race[1:]) * la_race[0] wa = fa_race[0] * np.sum(la_race[1:]) @@ -201,9 +199,22 @@ def get_pred_demos(authors): mm, wm, mw, ww = np.mean(gender, axis=0) * 100 WW, aw, wa, aa = np.mean(race, axis=0) * 100 - return mm, wm, mw, ww, WW, aw, wa,aa - -def print_statements(mm, wm, mw, ww, WW, aw, wa,aa): + return mm, wm, mw, ww, WW, aw, wa, aa, citation_matrix + +def gen_api_query(gender_key, name, gb): + url = "https://gender-api.com/get?key=" + gender_key + "&name=%s" % (quote(name)) + response = urlopen(url) + decoded = response.read().decode('utf-8') + gender = json.loads(decoded) + if gender['gender'] == 'female': + g = [0, gender['accuracy'] / 100.] + if gender['gender'] == 'male': + g = [gender['accuracy'] / 100., 0] + if gender['gender'] == 'unknown': + g = gb[:2] + return gender, g + +def print_statements(mm, wm, mw, ww, WW, aw, wa, aa): statement = "Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars \ are under-cited relative to the number of such papers in the field (1-9). Here we sought to proactively consider choosing references that reflect the \ diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. 
First, we obtained the predicted gender of the first \ From 0cc0df3dc0ed05f32ac0855a8d43ecbad044b396 Mon Sep 17 00:00:00 2001 From: Stiso Date: Fri, 5 Aug 2022 11:02:14 -0400 Subject: [PATCH 23/47] added check for names that have already been queried --- utils/__pycache__/queries.cpython-310.pyc | Bin 10186 -> 10461 bytes utils/queries.py | 10 ++++++++++ 2 files changed, 10 insertions(+) diff --git a/utils/__pycache__/queries.cpython-310.pyc b/utils/__pycache__/queries.cpython-310.pyc index 1b5e9ece0546b1ca84d2d20afbef247087b12f68..a4cf3c74c2e888a7aca0e059ba566383971bfefb 100644 GIT binary patch delta 1849 zcmb7E&u`;I6rLHo`QbWF;^v2)q^X-UY11|>$33jXf!!No6>&hIMOEpfApz2D>a9fe zMiDA2_OjASJ^~`(9QH5ZjpI0Nw0j_yC-1%Q``&x=#`Z7w z{^%Gdsgy?WSAOlE55B$ilW}*+*hg2#`whI$e)A5Oz8G^=r18$U@j)EeAmJwEdP#Ze z3A;m{D(`nhmK!n#S(^;B$+Az*>FD6J!_!ez52GLRj4+3qo0=m$ZIeqXOfX@_`V4fA zxx~s)g|QB%G$Vn=4V!QkW4Vhs(4r z4D}Z3i?s|& zP?-6MVv{p(ik^0EsACJb2Lst2_r=RvAA>UIFsQ z3{vL=d;bekQy)iHa_Q8*q?8QX(&qM8tVc!JRR>E@<|SV4F|I#hqT*Je!ZC8Gh*ijN zrAR~uR(ZuHxUL3kHCWed(o;nVO0v>emcC8E1}kiVjhYAqKDJQB76LnL0qs||u!b$H z{f`!^P=#?=3R}p7ffHb(BQPLullx3KVjaupgVLtE1`2XkE|n<)>0pm(a9|xr z@dhLY_22+DATEc`8@!1}kcgBP*vG6uFh`BA7<5?4WEfdRIefHjpqenU8aM&TysP;(Z_co@;4(Kqlj-s_|$Jf6E|7+A?$AZn*+qFq9~~Zb_q*AqLVd|pir4=~}u zX^{NvJ!5zv1vB6KCi9|ZQbM$tMJ<}38cX1%QyKCN29L2eMY(A#+_p5B~JCpu>^>Z^#mAan#4~h`UrT_o{ delta 1418 zcma)5OOMk?5bka}lf>gVj_o+{B$LPFJz+Q^4rq~fS#cUh2qd)Hxqw;|6rt?QY%~W{ ztsDl#iHGDP2*n4G_JUaDPpo#~2P}yH0G!aCw%l<%86+fZb-U`TufFQ4PCi`yd&l@K zlhFu1FFwD!b@kkz#?J|33tb)8Pq=>n(Z88^;D1bblP!_u2C!iYvK|>~)75|+(ZgR4 zws|&2^$5MqO<|6-apok#G3%HL6HHjJ1n4~H5s@3Ipo1A&F<;|`N4ScZbvFst(3nWa zzClK5O2QYx>>T^$#CEoG62Nk0EQj_t%dKO% z_3tQGhVnuc%h_OHW9;xU7+`y3mx+p~j*M{?6pT%B$I2Xn3RLl2NWvsHO1g=@E0b!X z9woUj&p`xgP?tIG1u|yS8f1A5MV&q760|D3j#H@c4UdEf6vH0`^6F?Qym6m-#Q#gn zwKgC%Y{=LI+SBfi7<@fWcqty|2Wb_6f^vMT;->rU68I}lwkOsDg&f8D+`duBhN^v$f9e`(QwXa97vRLSd^ F{{hQ$R$%}D diff --git a/utils/queries.py b/utils/queries.py index a64c4e8..4f7ff41 100644 --- a/utils/queries.py +++ 
b/utils/queries.py @@ -80,6 +80,8 @@ def get_pred_demos(authors, homedir, bibfile, gender_key, font='Palatino', metho # make a dictionary of names so we don't query the same thing twice full_name_data = {} first_name_data = {} + n_gen_queries = 0 + n_race_queries = 0 for paper in tqdm.tqdm(bibfile.entries, total=len(bibfile.entries)): if 'author' not in bibfile.entries[paper].persons.keys(): continue # some editorials have no authors @@ -140,6 +142,7 @@ def get_pred_demos(authors, homedir, bibfile, gender_key, font='Palatino', metho names = [{'lname': fa_lname, 'fname': fa_fname}] fa_df = pd.DataFrame(names, columns=['fname', 'lname']) odf = pred_fl_reg_name(fa_df, 'lname', 'fname') + n_race_queries = n_race_queries + 1 fa_race = [odf['nh_white'], odf['asian'], odf['hispanic'], odf['nh_black']] full_name_data[(fa_lname, fa_fname)] = fa_race @@ -149,6 +152,7 @@ def get_pred_demos(authors, homedir, bibfile, gender_key, font='Palatino', metho names = [{'lname': la_lname, 'fname': la_fname}] la_df = pd.DataFrame(names, columns=['fname', 'lname']) odf = pred_fl_reg_name(la_df, 'lname', 'fname') + n_race_queries = n_race_queries + 1 la_race = [odf['nh_white'], odf['asian'], odf['hispanic'], odf['nh_black']] full_name_data[(la_lname, la_fname)] = la_race @@ -156,12 +160,14 @@ def get_pred_demos(authors, homedir, bibfile, gender_key, font='Palatino', metho fa_gender, fa_g = first_name_data[fa_fname] else: fa_gender, fa_g = gen_api_query(gender_key, fa_fname, gb) + n_gen_queries = n_gen_queries + 1 first_name_data[fa_fname] = (fa_gender, fa_g) if la_fname in first_name_data: la_gender, la_g = first_name_data[la_fname] else: la_gender, la_g = gen_api_query(gender_key, la_fname, gb) + n_gen_queries= n_gen_queries + 1 first_name_data[la_fname] = (la_gender, la_g) fa_data = np.array( @@ -196,6 +202,10 @@ def get_pred_demos(authors, homedir, bibfile, gender_key, font='Palatino', metho citation_matrix = citation_matrix + paper_matrix idx = idx + 1 + # report queries + 
print(f"Queried gender api {n_gen_queries} times out of {len(bibfile.entries)*2} entries") + print(f"Queried race/ethnicity api {n_race_queries} times out of {len(bibfile.entries)*2} entries") + mm, wm, mw, ww = np.mean(gender, axis=0) * 100 WW, aw, wa, aa = np.mean(race, axis=0) * 100 From 406665f3664f83dacbb7acbbb6d27b1bcf1d07d4 Mon Sep 17 00:00:00 2001 From: Stiso Date: Fri, 5 Aug 2022 12:03:21 -0400 Subject: [PATCH 24/47] made histogram plotting in python --- cleanBib.ipynb | 103 +++++++------------- tests/aux/pipeline.py | 14 ++- tests/erroneous/pipeline.py | 14 ++- tests/erroneous/testBib_erroneous_clean.bib | 1 - utils/__pycache__/queries.cpython-310.pyc | Bin 10461 -> 10413 bytes 5 files changed, 61 insertions(+), 71 deletions(-) diff --git a/cleanBib.ipynb b/cleanBib.ipynb index bcafb12..3df6c48 100644 --- a/cleanBib.ipynb +++ b/cleanBib.ipynb @@ -186,7 +186,6 @@ "source": [ "genderAPI_key = '&key=YOUR ACCOUNT KEY HERE'\n", "\n", - "# TODO: Remove in the PR that gets rid of argparse. 
\n", "# The following saves the api key to a txt file just to be reloaded by the next cell\n", "with open(\"genderAPIkey.txt\", 'w') as f:\n", " f.write(genderAPI_key)\n", @@ -223,14 +222,13 @@ }, "outputs": [], "source": [ - "from ethnicolr import pred_fl_reg_name\n", "f = open(\"genderAPIkey.txt\", \"r\")\n", "genderAPI_key = f.readline().replace('\\n', '')\n", "\n", "import tensorflow as tf\n", "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'\n", "\n", - "mm, wm, mw, ww, WW, aw, wa, aa, citation_matrix = get_pred_demos((yourFirstAuthor+' '+yourLastAuthor).replace(',',''), homedir, bib_data, gender_key)\n", + "mm, wm, mw, ww, WW, aw, wa, aa, citation_matrix = get_pred_demos((yourFirstAuthor+' '+yourLastAuthor).replace(',',''), homedir, bib_data, genderAPI_key)\n", "statement, statementLatex = print_statements(mm, wm, mw, ww, WW, aw, wa, aa)" ] }, @@ -312,74 +310,43 @@ "outputs": [], "source": [ "# Plot a histogram #\n", - "names <- read.csv('/home/jovyan/predictions.csv', header=T)\n", - "total_citations <- nrow(na.omit(names))/2\n", - "names$GendCat <- gsub(\"female\", \"W\", names$GendCat, fixed=T)\n", - "names$GendCat <- gsub(\"male\", \"M\", names$GendCat, fixed=T)\n", - "names$GendCat <- gsub(\"unknown\", \"U\", names$GendCat, fixed=T)\n", - "gend_cats <- unique(names$GendCat) # get a vector of all the gender categories in your paper\n", - "\n", - "# Create an empty data frame that will be used to plot the histogram. 
This will have the gender category (e.g., WW, MM) in the first column and the percentage (e.g., number of WW citations divided by total number of citations * 100) in the second column #\n", - "dat_for_plot <- data.frame(gender_category = NA,\n", - " number = NA,\n", - " percentage = NA)\n", - "\n", - "\n", - "### Loop through each gender category from your paper, calculate the citation percentage of each gender category, and save the gender category and its citation percentage in dat_for_plot data frame ###\n", - "if (length(names$GendCat) != 1) {\n", - " \n", - " for (i in 1:length(gend_cats)){\n", - " \n", - " # Create an empty temporary data frame that will be binded to the dat_for_plot data frame\n", - " temp_df <- data.frame(gender_category = NA,\n", - " number = NA,\n", - " percentage = NA)\n", - " \n", - " # Get the gender category, the number of citations with that category, and calculate the percentage of citations with that category\n", - " gend_cat <- gend_cats[i]\n", - " number_gend_cat <- length(names$GendCat[names$GendCat == gend_cat])\n", - " perc_gend_cat <- (number_gend_cat / total_citations) * 100\n", - " \n", - " # Bind this information to the original data frame\n", - " temp_df$gender_category <- gend_cat\n", - " temp_df$number <- number_gend_cat\n", - " temp_df$percentage <- perc_gend_cat\n", - " dat_for_plot <- rbind(dat_for_plot, temp_df)\n", - " \n", - " }\n", - " \n", - "}\n", - "\n", + "names = pd.read_csv('/home/jovyan/predictions.csv')\n", + "total_citations = names.CitationKey.nunique()\n", + "names.GendCat = names.GendCat.str.replace('female', 'W', regex=False)\n", + "names.GendCat = names.GendCat.str.replace('male', 'M', regex=False)\n", + "names.GendCat = names.GendCat.str.replace('unknown', 'U', regex=False)\n", + "gend_cats = names['GendCat'].dropna().unique() # get a vector of all the gender categories in your paper\n", + "\n", + "# Create a data frame that will be used to plot the histogram. 
This will have the gender category (e.g., WW, MM) in the first column and the percentage (e.g., number of WW citations divided by total number of citations * 100) in the second column #\n", + "#dat_for_plot =\n", + "dat_for_plot = names.groupby('GendCat').size().reset_index()\n", + "dat_for_plot.rename(columns={0:'count'}, inplace=True)\n", + "dat_for_plot = dat_for_plot.assign(percentage=dat_for_plot['count']/total_citations*100)\n", "\n", "# Create a data frame with only the WW, MW, WM, MM categories and their base rates - to plot percent citations relative to benchmarks\n", - "dat_for_baserate_plot <- subset(dat_for_plot, gender_category == 'WW' | gender_category == 'MW' | gender_category == 'WM' | gender_category == 'MM')\n", - "baserate <- c(6.7, 9.4, 25.5, 58.4)\n", - "dat_for_baserate_plot$baserate <- baserate[order(c(which(dat_for_baserate_plot$gender_category == 'WW'), which(dat_for_baserate_plot$gender_category == 'MW'), which(dat_for_baserate_plot$gender_category == 'WM'), which(dat_for_baserate_plot$gender_category == 'MM')))]\n", - "dat_for_baserate_plot$citation_rel_to_baserate <- dat_for_baserate_plot$percentage - dat_for_baserate_plot$baserate\n", - "\n", - "\n", - "# Plot the Histogram of Number of Papers per category against predicted gender category #\n", - "\n", - "library(ggplot2)\n", - "\n", - "dat_for_plot = dat_for_plot[-1:-2,]\n", - "\n", - "dat_for_plot$gender_category <- factor(dat_for_plot$gender_category, levels = dat_for_plot$gender_category)\n", - "ggplot(dat_for_plot, aes(x = gender_category, y = number, fill = gender_category)) +\n", - " geom_bar(stat = 'identity', width = 0.75, na.rm = TRUE, show.legend = TRUE) + \n", - " scale_x_discrete(limits = c('WW', 'MW', 'WM', 'MM', 'UW', 'UM', 'WU', 'MU', 'UU')) +\n", - " geom_text(aes(label = number), vjust = -0.3, color = 'black', size = 2.5) +\n", - " theme(legend.position = 'right') + theme_minimal() +\n", - " xlab('Predicted gender category') + ylab('Number of papers') + ggtitle(\"\") 
+ theme_classic(base_size=15)\n", - "\n", + "dat_for_baserate_plot = dat_for_plot.loc[(dat_for_plot.GendCat == 'WW') |\n", + " (dat_for_plot.GendCat == 'MW') |\n", + " (dat_for_plot.GendCat == 'WM') |\n", + " (dat_for_plot.GendCat == 'MM'),:]\n", + "# MM,MW,WM,WW\n", + "baserate = [58.4, 9.4, 25.5, 6.7]\n", + "dat_for_baserate_plot['baserate'] = baserate\n", + "dat_for_baserate_plot = dat_for_baserate_plot.assign(citation_rel_to_baserate=\n", + " dat_for_baserate_plot.percentage - dat_for_baserate_plot.baserate\n", + " )\n", + "\n", + "# plot\n", + "plt.figure()\n", + "sns.barplot(data=dat_for_plot, x='GendCat', y='count', order=np.flip(gend_cats))\n", + "plt.xlabel('Predicted gender category')\n", + "plt.ylabel('Number of papers')\n", + "plt.tight_layout()\n", "\n", - "# Plot the Histogram of % citations relative to benchmarks against predicted gender category\n", - "ggplot(dat_for_baserate_plot, aes(x = gender_category, y = citation_rel_to_baserate, fill = gender_category)) +\n", - " geom_bar(stat = 'identity', width = 0.75, na.rm = TRUE, show.legend = TRUE) +\n", - " scale_x_discrete(limits = c('WW', 'MW', 'WM', 'MM')) +\n", - " geom_text(aes(label = round(citation_rel_to_baserate, digits = 2)), vjust = -0.3, color = 'black', size = 2.5) +\n", - " theme(legend.position = 'right') + theme_minimal() +\n", - " xlab('Predicted gender category') + ylab('% of citations relative to benchmarks') + ggtitle(\"\") + theme_classic(base_size=15)" + "plt.figure()\n", + "sns.barplot(data=dat_for_baserate_plot, x='GendCat', y='citation_rel_to_baserate', order=['WW','WM','MW','MM'])\n", + "plt.xlabel('Predicted gender category')\n", + "plt.ylabel('% of citations relative to benchmarks')\n", + "plt.tight_layout()" ] }, { diff --git a/tests/aux/pipeline.py b/tests/aux/pipeline.py index c524166..37884f0 100644 --- a/tests/aux/pipeline.py +++ b/tests/aux/pipeline.py @@ -35,4 +35,16 @@ bib_data = get_duplicates(bib_data, bib_files[0]) # get names, remove CDS, find self cites 
get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr) -bib_check(homedir) \ No newline at end of file +bib_check(homedir) + +# queries +try: + f = open("genderAPIkey.txt", "r") + genderAPI_key = f.readline().replace('\n', '') +except: + genderAPI_key = input("Enter genderAPI key:") +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + +mm, wm, mw, ww, WW, aw, wa, aa, citation_matrix = get_pred_demos((yourFirstAuthor+' '+yourLastAuthor).replace(',',''), homedir, bib_data, genderAPI_key) +statement, statementLatex = print_statements(mm, wm, mw, ww, WW, aw, wa, aa) +print(statement) \ No newline at end of file diff --git a/tests/erroneous/pipeline.py b/tests/erroneous/pipeline.py index c524166..37884f0 100644 --- a/tests/erroneous/pipeline.py +++ b/tests/erroneous/pipeline.py @@ -35,4 +35,16 @@ bib_data = get_duplicates(bib_data, bib_files[0]) # get names, remove CDS, find self cites get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr) -bib_check(homedir) \ No newline at end of file +bib_check(homedir) + +# queries +try: + f = open("genderAPIkey.txt", "r") + genderAPI_key = f.readline().replace('\n', '') +except: + genderAPI_key = input("Enter genderAPI key:") +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + +mm, wm, mw, ww, WW, aw, wa, aa, citation_matrix = get_pred_demos((yourFirstAuthor+' '+yourLastAuthor).replace(',',''), homedir, bib_data, genderAPI_key) +statement, statementLatex = print_statements(mm, wm, mw, ww, WW, aw, wa, aa) +print(statement) \ No newline at end of file diff --git a/tests/erroneous/testBib_erroneous_clean.bib b/tests/erroneous/testBib_erroneous_clean.bib index dc0ece8..db73a74 100644 --- a/tests/erroneous/testBib_erroneous_clean.bib +++ b/tests/erroneous/testBib_erroneous_clean.bib @@ -234,4 +234,3 @@ @article{zurn2020network volume = {375}, year = {2020} } - diff --git a/utils/__pycache__/queries.cpython-310.pyc b/utils/__pycache__/queries.cpython-310.pyc index 
a4cf3c74c2e888a7aca0e059ba566383971bfefb..eb47eaa92184637f15d4825f0927a31059d9e014 100644 GIT binary patch delta 186 zcmcZ`xHgbCpO=@50SK-byiF3@$Q#VcxN34D>k3BO&0K7p%)Iw&cx%LJMAMig8ET~_ z>u_fBN&!VAYb3xTa+B9_rt${Y@YRS)GQ=>|D%2`Y*5K0TEd+{4fJKyQl{e>bDR3}y zP3{%y*XCp7VG?5GVdP+x0g`-7sC+IUUx0}NCbOSs~0my NO}5vV%`DHu3jiapD4qZS delta 234 zcmZ1*csGzYpO=@50SKfG-X<;D$Q#VccxiGX>k7ue&0K7p%#u%Qcx%LJMAMig8EU0U z7$NLtMn;Co3Y?jeYCu`Z8VRth98ea_2Fp(4OchP8;j0mqWQbv^Rj5@2%8G(zIJoph zTY)kXU>T)aWuS}%NM^GOmjVZ)(BxF1eq$a+K1LoUAx0iX4n`Ru$;X5)!UYuJViaKF m0Lp^do8^VCFf!iS{6%D$5aaX7v(<|kZ%yXcn9Z!r!wUe(DKV!2 From 7c9fe8b453121f9f0368eaa434cad98342d03ef0 Mon Sep 17 00:00:00 2001 From: Stiso Date: Fri, 5 Aug 2022 12:16:48 -0400 Subject: [PATCH 25/47] fixed bug in histogram --- cleanBib.ipynb | 1 - 1 file changed, 1 deletion(-) diff --git a/cleanBib.ipynb b/cleanBib.ipynb index 3df6c48..10b697f 100644 --- a/cleanBib.ipynb +++ b/cleanBib.ipynb @@ -318,7 +318,6 @@ "gend_cats = names['GendCat'].dropna().unique() # get a vector of all the gender categories in your paper\n", "\n", "# Create a data frame that will be used to plot the histogram. 
This will have the gender category (e.g., WW, MM) in the first column and the percentage (e.g., number of WW citations divided by total number of citations * 100) in the second column #\n", - "#dat_for_plot =\n", "dat_for_plot = names.groupby('GendCat').size().reset_index()\n", "dat_for_plot.rename(columns={0:'count'}, inplace=True)\n", "dat_for_plot = dat_for_plot.assign(percentage=dat_for_plot['count']/total_citations*100)\n", From a9ab4b1b6fe7b5eb4e50c0af2b6772d2d4b568ae Mon Sep 17 00:00:00 2001 From: Stiso Date: Fri, 12 Aug 2022 16:46:02 -0400 Subject: [PATCH 26/47] added env file --- tests/immaculate/env_js.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 tests/immaculate/env_js.yml diff --git a/tests/immaculate/env_js.yml b/tests/immaculate/env_js.yml new file mode 100644 index 0000000..6c49a40 --- /dev/null +++ b/tests/immaculate/env_js.yml @@ -0,0 +1,15 @@ +name: cleanBib +channels: + - defaults +dependencies: + - pip + - python + - habanero + - pylatexenc + - pybtex + - bibtexparser + - numpy + - tensorflow=2.8 + - ipykernel + - seaborn +prefix: /Users/stisoj/opt/anaconda3/envs/cleanBib From 694882c9c7830463e29b9af71651cda14ddb00e4 Mon Sep 17 00:00:00 2001 From: Stiso Date: Fri, 12 Aug 2022 16:47:07 -0400 Subject: [PATCH 27/47] fixed file paths --- cleanBib.ipynb | 51 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/cleanBib.ipynb b/cleanBib.ipynb index 10b697f..1133d54 100644 --- a/cleanBib.ipynb +++ b/cleanBib.ipynb @@ -43,11 +43,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "kernel": "Python 3" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No optional .tex file found.\n" + ] + } + ], "source": [ "import glob\n", "from habanero import Crossref\n", @@ -55,7 +63,7 @@ "import os\n", "from pathlib import Path\n", "wd = Path(os.getcwd())\n", - "sys.path.insert(1, 
f'{wd.parent.parent.absolute()}/utils')\n", + "sys.path.insert(1, f'{wd.absolute()}/utils')\n", "from preprocessing import *\n", "from ethnicolr import pred_fl_reg_name\n", "import tensorflow as tf\n", @@ -146,6 +154,9 @@ }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "## 3. Estimate gender and race of authors from cleaned bibliography\n", "\n", @@ -171,10 +182,7 @@ "__NOTE__: your free account has 500 queries per month. This box contains the code that will use your limited API credits/queries if it runs without error. Re-running all code repeatedly will repeatedly use these credits.\n", "\n", "Then, run the code blocks below. (click to select the block and then press Ctrl+Enter; or click the block and press the Run button in the top menubar)" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", @@ -400,17 +408,21 @@ ], "metadata": { "kernelspec": { - "display_name": "SoS", - "language": "sos", - "name": "sos" + "display_name": "Python 3.10.5 ('cleanBib')", + "language": "python", + "name": "python3" }, "language_info": { - "codemirror_mode": "sos", - "file_extension": ".sos", - "mimetype": "text/x-sos", - "name": "sos", - "nbconvert_exporter": "sos_notebook.converter.SoS_Exporter", - "pygments_lexer": "sos" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" }, "sos": { "kernels": [ @@ -437,8 +449,13 @@ "height": 0 }, "version": "0.20.1" + }, + "vscode": { + "interpreter": { + "hash": "66f30d3a05dff018f3baf45891c3cf21b32f9380ea78dc5d1d8b601d704d86ef" + } } }, "nbformat": 4, "nbformat_minor": 1 -} \ No newline at end of file +} From 3753a422470c0d038ceac1c327c14b9e8075353d Mon Sep 17 00:00:00 2001 From: Stiso Date: Fri, 12 Aug 2022 16:59:53 -0400 Subject: [PATCH 28/47] new environment --- env_js.yml | 173 
+++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 env_js.yml diff --git a/env_js.yml b/env_js.yml new file mode 100644 index 0000000..6b2fdd3 --- /dev/null +++ b/env_js.yml @@ -0,0 +1,173 @@ +name: cleanBib +channels: + - conda-forge + - defaults +dependencies: + - abseil-cpp=20210324.2=he49afe7_0 + - absl-py=1.1.0=pyhd8ed1ab_0 + - aiohttp=3.8.1=py310h1961e1f_1 + - aiosignal=1.2.0=pyhd8ed1ab_0 + - appnope=0.1.2=py310hecd8cb5_1001 + - asttokens=2.0.5=pyhd3eb1b0_0 + - astunparse=1.6.3=pyhd8ed1ab_0 + - async-timeout=4.0.2=pyhd8ed1ab_0 + - attrs=21.4.0=pyhd8ed1ab_0 + - backcall=0.2.0=pyhd3eb1b0_0 + - bibtexparser=1.3.0=pyhd8ed1ab_0 + - blas=1.0=mkl + - blinker=1.4=py_1 + - bottleneck=1.3.5=py310h4e76f89_0 + - brotli=1.0.9=hca72f7f_7 + - brotli-bin=1.0.9=hca72f7f_7 + - brotlipy=0.7.0=py310h1961e1f_1004 + - bzip2=1.0.8=h0d85af4_4 + - c-ares=1.18.1=h0d85af4_0 + - ca-certificates=2022.07.19=hecd8cb5_0 + - cachetools=5.0.0=pyhd8ed1ab_0 + - certifi=2022.6.15=py310hecd8cb5_0 + - cffi=1.15.1=py310h96bbf6e_0 + - charset-normalizer=2.1.0=pyhd8ed1ab_0 + - click=8.1.3=py310h2ec42d9_0 + - colorama=0.4.5=pyhd8ed1ab_0 + - cryptography=37.0.1=py310hf6deb26_0 + - cycler=0.11.0=pyhd3eb1b0_0 + - debugpy=1.5.1=py310he9d5cce_0 + - decorator=5.1.1=pyhd3eb1b0_0 + - entrypoints=0.4=py310hecd8cb5_0 + - executing=0.8.3=pyhd3eb1b0_0 + - fonttools=4.25.0=pyhd3eb1b0_0 + - freetype=2.11.0=hd8bbffd_0 + - frozenlist=1.3.0=py310h1961e1f_1 + - gast=0.5.3=pyhd8ed1ab_0 + - giflib=5.2.1=hbcb3906_2 + - google-auth=2.9.1=pyh6c4a22f_0 + - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0 + - google-pasta=0.2.0=pyh8c360ce_0 + - grpc-cpp=1.45.2=h360b188_4 + - grpcio=1.45.0=py310h1da61bb_0 + - h5py=3.6.0=py310h6c517f8_0 + - habanero=1.2.2=pyh6c4a22f_0 + - hdf5=1.10.6=hdbbcd12_0 + - icu=70.1=h96cf925_0 + - idna=3.3=pyhd8ed1ab_0 + - importlib-metadata=4.11.4=py310h2ec42d9_0 + - intel-openmp=2021.4.0=hecd8cb5_3538 + - ipykernel=6.9.1=py310hecd8cb5_0 + - 
ipython=8.4.0=py310hecd8cb5_0 + - jedi=0.18.1=py310hecd8cb5_1 + - jpeg=9e=hac89ed1_2 + - jupyter_client=7.2.2=py310hecd8cb5_0 + - jupyter_core=4.10.0=py310hecd8cb5_0 + - keras=2.8.0=pyhd8ed1ab_0 + - keras-preprocessing=1.1.2=pyhd8ed1ab_0 + - kiwisolver=1.4.2=py310he9d5cce_0 + - krb5=1.19.3=hb49756b_0 + - latexcodec=2.0.1=pyh9f0ad1d_0 + - lcms2=2.12=hf1fd2bf_0 + - libbrotlicommon=1.0.9=hca72f7f_7 + - libbrotlidec=1.0.9=hca72f7f_7 + - libbrotlienc=1.0.9=hca72f7f_7 + - libcurl=7.83.1=h372c54d_0 + - libcxx=14.0.6=hce7ea42_0 + - libedit=3.1.20191231=h0678c8f_2 + - libev=4.33=haf1e3a3_1 + - libffi=3.4.2=h0d85af4_5 + - libgfortran=3.0.1=0 + - libnghttp2=1.47.0=h942079c_0 + - libpng=1.6.37=h5a3d3bf_3 + - libprotobuf=3.20.1=h2292cb8_0 + - libsodium=1.0.18=h1de35cc_0 + - libssh2=1.10.0=h52ee1ee_2 + - libtiff=4.2.0=hdb42f99_1 + - libwebp=1.2.2=h56c3ce4_0 + - libwebp-base=1.2.2=hca72f7f_0 + - libzlib=1.2.12=hfe4f2af_2 + - lz4-c=1.9.3=h23ab428_1 + - markdown=3.4.1=pyhd8ed1ab_0 + - matplotlib=3.5.1=py310hecd8cb5_1 + - matplotlib-base=3.5.1=py310hfb0c5b7_1 + - matplotlib-inline=0.1.2=pyhd3eb1b0_2 + - mkl=2021.4.0=hecd8cb5_637 + - mkl-service=2.4.0=py310hca72f7f_0 + - mkl_fft=1.3.1=py310hf879493_0 + - mkl_random=1.2.2=py310hc081a56_0 + - multidict=6.0.2=py310h1961e1f_1 + - munkres=1.1.4=py_0 + - ncurses=6.3=h96cf925_1 + - nest-asyncio=1.5.5=py310hecd8cb5_0 + - numexpr=2.8.3=py310hdcd3fac_0 + - numpy=1.22.3=py310hdcd3fac_0 + - numpy-base=1.22.3=py310hfd2de13_0 + - oauthlib=3.2.0=pyhd8ed1ab_0 + - openssl=1.1.1q=hca72f7f_0 + - opt_einsum=3.3.0=pyhd8ed1ab_1 + - packaging=21.3=pyhd3eb1b0_0 + - parso=0.8.3=pyhd3eb1b0_0 + - pexpect=4.8.0=pyhd3eb1b0_3 + - pickleshare=0.7.5=pyhd3eb1b0_1003 + - pillow=9.2.0=py310hde71d04_1 + - pip=22.1.2=py310hecd8cb5_0 + - prompt-toolkit=3.0.20=pyhd3eb1b0_0 + - protobuf=3.20.1=py310hd4537e4_0 + - ptyprocess=0.7.0=pyhd3eb1b0_2 + - pure_eval=0.2.2=pyhd3eb1b0_0 + - pyasn1=0.4.8=py_0 + - pyasn1-modules=0.2.7=py_0 + - pybtex=0.24.0=pyhd8ed1ab_2 + - 
pycparser=2.21=pyhd8ed1ab_0 + - pygments=2.11.2=pyhd3eb1b0_0 + - pyjwt=2.4.0=pyhd8ed1ab_0 + - pylatexenc=2.10=pyhd8ed1ab_0 + - pyopenssl=22.0.0=pyhd8ed1ab_0 + - pyparsing=3.0.9=pyhd8ed1ab_0 + - pysocks=1.7.1=py310h2ec42d9_5 + - python=3.10.5=hdaaf3db_0_cpython + - python-dateutil=2.8.2=pyhd3eb1b0_0 + - python-flatbuffers=2.0=pyhd8ed1ab_0 + - python_abi=3.10=2_cp310 + - pytz=2022.1=py310hecd8cb5_0 + - pyu2f=0.1.5=pyhd8ed1ab_0 + - pyyaml=6.0=py310h1961e1f_4 + - pyzmq=23.2.0=py310he9d5cce_0 + - re2=2022.06.01=hb486fe8_0 + - readline=8.1.2=h3899abd_0 + - requests=2.28.1=pyhd8ed1ab_0 + - requests-oauthlib=1.3.1=pyhd8ed1ab_0 + - rsa=4.9=pyhd8ed1ab_0 + - scipy=1.7.3=py310h3dd3380_0 + - seaborn=0.11.2=pyhd3eb1b0_0 + - setuptools=63.2.0=py310h2ec42d9_0 + - six=1.16.0=pyh6c4a22f_0 + - snappy=1.1.9=h6e38e02_1 + - sqlite=3.39.1=hd9f0692_0 + - stack_data=0.2.0=pyhd3eb1b0_0 + - tensorboard=2.8.0=pyhd8ed1ab_1 + - tensorboard-data-server=0.6.0=py310hd6fa1ae_2 + - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0 + - tensorflow=2.8.1=cpu_py310h22f808f_0 + - tensorflow-base=2.8.1=cpu_py310h196d2ec_0 + - tensorflow-estimator=2.8.1=cpu_py310h7bb394d_0 + - termcolor=1.1.0=pyhd8ed1ab_3 + - tk=8.6.12=h5dbffcc_0 + - tornado=6.1=py310hca72f7f_0 + - tqdm=4.64.0=pyhd8ed1ab_0 + - traitlets=5.1.1=pyhd3eb1b0_0 + - typing-extensions=4.3.0=hd8ed1ab_0 + - typing_extensions=4.3.0=pyha770c72_0 + - tzdata=2022a=h191b570_0 + - urllib3=1.26.10=pyhd8ed1ab_0 + - wcwidth=0.2.5=pyhd3eb1b0_0 + - werkzeug=2.1.2=pyhd8ed1ab_1 + - wheel=0.37.1=pyhd8ed1ab_0 + - wrapt=1.14.1=py310h6c45266_0 + - xz=5.2.5=haf1e3a3_1 + - yaml=0.2.5=h0d85af4_2 + - yarl=1.7.2=py310h1961e1f_2 + - zeromq=4.3.4=h23ab428_0 + - zipp=3.8.0=pyhd8ed1ab_0 + - zlib=1.2.12=hfe4f2af_2 + - zstd=1.5.2=hcb37349_0 + - pip: + - ethnicolr==0.9.1 + - pandas==1.4.3 +prefix: /Users/stisoj/opt/anaconda3/envs/cleanBib From 0881e21fbe0b8af6589c7f4c59ffc59cf96e5590 Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Fri, 12 Aug 2022 17:03:44 -0400 Subject: [PATCH 29/47] 
move yml --- tests/immaculate/env_js.yml | 178 ++++++++++++++++++++++++++++++++++-- 1 file changed, 168 insertions(+), 10 deletions(-) diff --git a/tests/immaculate/env_js.yml b/tests/immaculate/env_js.yml index 6c49a40..6b2fdd3 100644 --- a/tests/immaculate/env_js.yml +++ b/tests/immaculate/env_js.yml @@ -1,15 +1,173 @@ name: cleanBib channels: + - conda-forge - defaults dependencies: - - pip - - python - - habanero - - pylatexenc - - pybtex - - bibtexparser - - numpy - - tensorflow=2.8 - - ipykernel - - seaborn + - abseil-cpp=20210324.2=he49afe7_0 + - absl-py=1.1.0=pyhd8ed1ab_0 + - aiohttp=3.8.1=py310h1961e1f_1 + - aiosignal=1.2.0=pyhd8ed1ab_0 + - appnope=0.1.2=py310hecd8cb5_1001 + - asttokens=2.0.5=pyhd3eb1b0_0 + - astunparse=1.6.3=pyhd8ed1ab_0 + - async-timeout=4.0.2=pyhd8ed1ab_0 + - attrs=21.4.0=pyhd8ed1ab_0 + - backcall=0.2.0=pyhd3eb1b0_0 + - bibtexparser=1.3.0=pyhd8ed1ab_0 + - blas=1.0=mkl + - blinker=1.4=py_1 + - bottleneck=1.3.5=py310h4e76f89_0 + - brotli=1.0.9=hca72f7f_7 + - brotli-bin=1.0.9=hca72f7f_7 + - brotlipy=0.7.0=py310h1961e1f_1004 + - bzip2=1.0.8=h0d85af4_4 + - c-ares=1.18.1=h0d85af4_0 + - ca-certificates=2022.07.19=hecd8cb5_0 + - cachetools=5.0.0=pyhd8ed1ab_0 + - certifi=2022.6.15=py310hecd8cb5_0 + - cffi=1.15.1=py310h96bbf6e_0 + - charset-normalizer=2.1.0=pyhd8ed1ab_0 + - click=8.1.3=py310h2ec42d9_0 + - colorama=0.4.5=pyhd8ed1ab_0 + - cryptography=37.0.1=py310hf6deb26_0 + - cycler=0.11.0=pyhd3eb1b0_0 + - debugpy=1.5.1=py310he9d5cce_0 + - decorator=5.1.1=pyhd3eb1b0_0 + - entrypoints=0.4=py310hecd8cb5_0 + - executing=0.8.3=pyhd3eb1b0_0 + - fonttools=4.25.0=pyhd3eb1b0_0 + - freetype=2.11.0=hd8bbffd_0 + - frozenlist=1.3.0=py310h1961e1f_1 + - gast=0.5.3=pyhd8ed1ab_0 + - giflib=5.2.1=hbcb3906_2 + - google-auth=2.9.1=pyh6c4a22f_0 + - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0 + - google-pasta=0.2.0=pyh8c360ce_0 + - grpc-cpp=1.45.2=h360b188_4 + - grpcio=1.45.0=py310h1da61bb_0 + - h5py=3.6.0=py310h6c517f8_0 + - habanero=1.2.2=pyh6c4a22f_0 + - 
hdf5=1.10.6=hdbbcd12_0 + - icu=70.1=h96cf925_0 + - idna=3.3=pyhd8ed1ab_0 + - importlib-metadata=4.11.4=py310h2ec42d9_0 + - intel-openmp=2021.4.0=hecd8cb5_3538 + - ipykernel=6.9.1=py310hecd8cb5_0 + - ipython=8.4.0=py310hecd8cb5_0 + - jedi=0.18.1=py310hecd8cb5_1 + - jpeg=9e=hac89ed1_2 + - jupyter_client=7.2.2=py310hecd8cb5_0 + - jupyter_core=4.10.0=py310hecd8cb5_0 + - keras=2.8.0=pyhd8ed1ab_0 + - keras-preprocessing=1.1.2=pyhd8ed1ab_0 + - kiwisolver=1.4.2=py310he9d5cce_0 + - krb5=1.19.3=hb49756b_0 + - latexcodec=2.0.1=pyh9f0ad1d_0 + - lcms2=2.12=hf1fd2bf_0 + - libbrotlicommon=1.0.9=hca72f7f_7 + - libbrotlidec=1.0.9=hca72f7f_7 + - libbrotlienc=1.0.9=hca72f7f_7 + - libcurl=7.83.1=h372c54d_0 + - libcxx=14.0.6=hce7ea42_0 + - libedit=3.1.20191231=h0678c8f_2 + - libev=4.33=haf1e3a3_1 + - libffi=3.4.2=h0d85af4_5 + - libgfortran=3.0.1=0 + - libnghttp2=1.47.0=h942079c_0 + - libpng=1.6.37=h5a3d3bf_3 + - libprotobuf=3.20.1=h2292cb8_0 + - libsodium=1.0.18=h1de35cc_0 + - libssh2=1.10.0=h52ee1ee_2 + - libtiff=4.2.0=hdb42f99_1 + - libwebp=1.2.2=h56c3ce4_0 + - libwebp-base=1.2.2=hca72f7f_0 + - libzlib=1.2.12=hfe4f2af_2 + - lz4-c=1.9.3=h23ab428_1 + - markdown=3.4.1=pyhd8ed1ab_0 + - matplotlib=3.5.1=py310hecd8cb5_1 + - matplotlib-base=3.5.1=py310hfb0c5b7_1 + - matplotlib-inline=0.1.2=pyhd3eb1b0_2 + - mkl=2021.4.0=hecd8cb5_637 + - mkl-service=2.4.0=py310hca72f7f_0 + - mkl_fft=1.3.1=py310hf879493_0 + - mkl_random=1.2.2=py310hc081a56_0 + - multidict=6.0.2=py310h1961e1f_1 + - munkres=1.1.4=py_0 + - ncurses=6.3=h96cf925_1 + - nest-asyncio=1.5.5=py310hecd8cb5_0 + - numexpr=2.8.3=py310hdcd3fac_0 + - numpy=1.22.3=py310hdcd3fac_0 + - numpy-base=1.22.3=py310hfd2de13_0 + - oauthlib=3.2.0=pyhd8ed1ab_0 + - openssl=1.1.1q=hca72f7f_0 + - opt_einsum=3.3.0=pyhd8ed1ab_1 + - packaging=21.3=pyhd3eb1b0_0 + - parso=0.8.3=pyhd3eb1b0_0 + - pexpect=4.8.0=pyhd3eb1b0_3 + - pickleshare=0.7.5=pyhd3eb1b0_1003 + - pillow=9.2.0=py310hde71d04_1 + - pip=22.1.2=py310hecd8cb5_0 + - prompt-toolkit=3.0.20=pyhd3eb1b0_0 + - 
protobuf=3.20.1=py310hd4537e4_0 + - ptyprocess=0.7.0=pyhd3eb1b0_2 + - pure_eval=0.2.2=pyhd3eb1b0_0 + - pyasn1=0.4.8=py_0 + - pyasn1-modules=0.2.7=py_0 + - pybtex=0.24.0=pyhd8ed1ab_2 + - pycparser=2.21=pyhd8ed1ab_0 + - pygments=2.11.2=pyhd3eb1b0_0 + - pyjwt=2.4.0=pyhd8ed1ab_0 + - pylatexenc=2.10=pyhd8ed1ab_0 + - pyopenssl=22.0.0=pyhd8ed1ab_0 + - pyparsing=3.0.9=pyhd8ed1ab_0 + - pysocks=1.7.1=py310h2ec42d9_5 + - python=3.10.5=hdaaf3db_0_cpython + - python-dateutil=2.8.2=pyhd3eb1b0_0 + - python-flatbuffers=2.0=pyhd8ed1ab_0 + - python_abi=3.10=2_cp310 + - pytz=2022.1=py310hecd8cb5_0 + - pyu2f=0.1.5=pyhd8ed1ab_0 + - pyyaml=6.0=py310h1961e1f_4 + - pyzmq=23.2.0=py310he9d5cce_0 + - re2=2022.06.01=hb486fe8_0 + - readline=8.1.2=h3899abd_0 + - requests=2.28.1=pyhd8ed1ab_0 + - requests-oauthlib=1.3.1=pyhd8ed1ab_0 + - rsa=4.9=pyhd8ed1ab_0 + - scipy=1.7.3=py310h3dd3380_0 + - seaborn=0.11.2=pyhd3eb1b0_0 + - setuptools=63.2.0=py310h2ec42d9_0 + - six=1.16.0=pyh6c4a22f_0 + - snappy=1.1.9=h6e38e02_1 + - sqlite=3.39.1=hd9f0692_0 + - stack_data=0.2.0=pyhd3eb1b0_0 + - tensorboard=2.8.0=pyhd8ed1ab_1 + - tensorboard-data-server=0.6.0=py310hd6fa1ae_2 + - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0 + - tensorflow=2.8.1=cpu_py310h22f808f_0 + - tensorflow-base=2.8.1=cpu_py310h196d2ec_0 + - tensorflow-estimator=2.8.1=cpu_py310h7bb394d_0 + - termcolor=1.1.0=pyhd8ed1ab_3 + - tk=8.6.12=h5dbffcc_0 + - tornado=6.1=py310hca72f7f_0 + - tqdm=4.64.0=pyhd8ed1ab_0 + - traitlets=5.1.1=pyhd3eb1b0_0 + - typing-extensions=4.3.0=hd8ed1ab_0 + - typing_extensions=4.3.0=pyha770c72_0 + - tzdata=2022a=h191b570_0 + - urllib3=1.26.10=pyhd8ed1ab_0 + - wcwidth=0.2.5=pyhd3eb1b0_0 + - werkzeug=2.1.2=pyhd8ed1ab_1 + - wheel=0.37.1=pyhd8ed1ab_0 + - wrapt=1.14.1=py310h6c45266_0 + - xz=5.2.5=haf1e3a3_1 + - yaml=0.2.5=h0d85af4_2 + - yarl=1.7.2=py310h1961e1f_2 + - zeromq=4.3.4=h23ab428_0 + - zipp=3.8.0=pyhd8ed1ab_0 + - zlib=1.2.12=hfe4f2af_2 + - zstd=1.5.2=hcb37349_0 + - pip: + - ethnicolr==0.9.1 + - pandas==1.4.3 prefix: 
/Users/stisoj/opt/anaconda3/envs/cleanBib From 450db3f50f3b3761e248d993ffdfabb7c9a86348 Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Fri, 12 Aug 2022 17:04:26 -0400 Subject: [PATCH 30/47] move yml --- env_js.yml | 173 ----------------------------------------------------- 1 file changed, 173 deletions(-) delete mode 100644 env_js.yml diff --git a/env_js.yml b/env_js.yml deleted file mode 100644 index 6b2fdd3..0000000 --- a/env_js.yml +++ /dev/null @@ -1,173 +0,0 @@ -name: cleanBib -channels: - - conda-forge - - defaults -dependencies: - - abseil-cpp=20210324.2=he49afe7_0 - - absl-py=1.1.0=pyhd8ed1ab_0 - - aiohttp=3.8.1=py310h1961e1f_1 - - aiosignal=1.2.0=pyhd8ed1ab_0 - - appnope=0.1.2=py310hecd8cb5_1001 - - asttokens=2.0.5=pyhd3eb1b0_0 - - astunparse=1.6.3=pyhd8ed1ab_0 - - async-timeout=4.0.2=pyhd8ed1ab_0 - - attrs=21.4.0=pyhd8ed1ab_0 - - backcall=0.2.0=pyhd3eb1b0_0 - - bibtexparser=1.3.0=pyhd8ed1ab_0 - - blas=1.0=mkl - - blinker=1.4=py_1 - - bottleneck=1.3.5=py310h4e76f89_0 - - brotli=1.0.9=hca72f7f_7 - - brotli-bin=1.0.9=hca72f7f_7 - - brotlipy=0.7.0=py310h1961e1f_1004 - - bzip2=1.0.8=h0d85af4_4 - - c-ares=1.18.1=h0d85af4_0 - - ca-certificates=2022.07.19=hecd8cb5_0 - - cachetools=5.0.0=pyhd8ed1ab_0 - - certifi=2022.6.15=py310hecd8cb5_0 - - cffi=1.15.1=py310h96bbf6e_0 - - charset-normalizer=2.1.0=pyhd8ed1ab_0 - - click=8.1.3=py310h2ec42d9_0 - - colorama=0.4.5=pyhd8ed1ab_0 - - cryptography=37.0.1=py310hf6deb26_0 - - cycler=0.11.0=pyhd3eb1b0_0 - - debugpy=1.5.1=py310he9d5cce_0 - - decorator=5.1.1=pyhd3eb1b0_0 - - entrypoints=0.4=py310hecd8cb5_0 - - executing=0.8.3=pyhd3eb1b0_0 - - fonttools=4.25.0=pyhd3eb1b0_0 - - freetype=2.11.0=hd8bbffd_0 - - frozenlist=1.3.0=py310h1961e1f_1 - - gast=0.5.3=pyhd8ed1ab_0 - - giflib=5.2.1=hbcb3906_2 - - google-auth=2.9.1=pyh6c4a22f_0 - - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0 - - google-pasta=0.2.0=pyh8c360ce_0 - - grpc-cpp=1.45.2=h360b188_4 - - grpcio=1.45.0=py310h1da61bb_0 - - h5py=3.6.0=py310h6c517f8_0 - - 
habanero=1.2.2=pyh6c4a22f_0 - - hdf5=1.10.6=hdbbcd12_0 - - icu=70.1=h96cf925_0 - - idna=3.3=pyhd8ed1ab_0 - - importlib-metadata=4.11.4=py310h2ec42d9_0 - - intel-openmp=2021.4.0=hecd8cb5_3538 - - ipykernel=6.9.1=py310hecd8cb5_0 - - ipython=8.4.0=py310hecd8cb5_0 - - jedi=0.18.1=py310hecd8cb5_1 - - jpeg=9e=hac89ed1_2 - - jupyter_client=7.2.2=py310hecd8cb5_0 - - jupyter_core=4.10.0=py310hecd8cb5_0 - - keras=2.8.0=pyhd8ed1ab_0 - - keras-preprocessing=1.1.2=pyhd8ed1ab_0 - - kiwisolver=1.4.2=py310he9d5cce_0 - - krb5=1.19.3=hb49756b_0 - - latexcodec=2.0.1=pyh9f0ad1d_0 - - lcms2=2.12=hf1fd2bf_0 - - libbrotlicommon=1.0.9=hca72f7f_7 - - libbrotlidec=1.0.9=hca72f7f_7 - - libbrotlienc=1.0.9=hca72f7f_7 - - libcurl=7.83.1=h372c54d_0 - - libcxx=14.0.6=hce7ea42_0 - - libedit=3.1.20191231=h0678c8f_2 - - libev=4.33=haf1e3a3_1 - - libffi=3.4.2=h0d85af4_5 - - libgfortran=3.0.1=0 - - libnghttp2=1.47.0=h942079c_0 - - libpng=1.6.37=h5a3d3bf_3 - - libprotobuf=3.20.1=h2292cb8_0 - - libsodium=1.0.18=h1de35cc_0 - - libssh2=1.10.0=h52ee1ee_2 - - libtiff=4.2.0=hdb42f99_1 - - libwebp=1.2.2=h56c3ce4_0 - - libwebp-base=1.2.2=hca72f7f_0 - - libzlib=1.2.12=hfe4f2af_2 - - lz4-c=1.9.3=h23ab428_1 - - markdown=3.4.1=pyhd8ed1ab_0 - - matplotlib=3.5.1=py310hecd8cb5_1 - - matplotlib-base=3.5.1=py310hfb0c5b7_1 - - matplotlib-inline=0.1.2=pyhd3eb1b0_2 - - mkl=2021.4.0=hecd8cb5_637 - - mkl-service=2.4.0=py310hca72f7f_0 - - mkl_fft=1.3.1=py310hf879493_0 - - mkl_random=1.2.2=py310hc081a56_0 - - multidict=6.0.2=py310h1961e1f_1 - - munkres=1.1.4=py_0 - - ncurses=6.3=h96cf925_1 - - nest-asyncio=1.5.5=py310hecd8cb5_0 - - numexpr=2.8.3=py310hdcd3fac_0 - - numpy=1.22.3=py310hdcd3fac_0 - - numpy-base=1.22.3=py310hfd2de13_0 - - oauthlib=3.2.0=pyhd8ed1ab_0 - - openssl=1.1.1q=hca72f7f_0 - - opt_einsum=3.3.0=pyhd8ed1ab_1 - - packaging=21.3=pyhd3eb1b0_0 - - parso=0.8.3=pyhd3eb1b0_0 - - pexpect=4.8.0=pyhd3eb1b0_3 - - pickleshare=0.7.5=pyhd3eb1b0_1003 - - pillow=9.2.0=py310hde71d04_1 - - pip=22.1.2=py310hecd8cb5_0 - - 
prompt-toolkit=3.0.20=pyhd3eb1b0_0 - - protobuf=3.20.1=py310hd4537e4_0 - - ptyprocess=0.7.0=pyhd3eb1b0_2 - - pure_eval=0.2.2=pyhd3eb1b0_0 - - pyasn1=0.4.8=py_0 - - pyasn1-modules=0.2.7=py_0 - - pybtex=0.24.0=pyhd8ed1ab_2 - - pycparser=2.21=pyhd8ed1ab_0 - - pygments=2.11.2=pyhd3eb1b0_0 - - pyjwt=2.4.0=pyhd8ed1ab_0 - - pylatexenc=2.10=pyhd8ed1ab_0 - - pyopenssl=22.0.0=pyhd8ed1ab_0 - - pyparsing=3.0.9=pyhd8ed1ab_0 - - pysocks=1.7.1=py310h2ec42d9_5 - - python=3.10.5=hdaaf3db_0_cpython - - python-dateutil=2.8.2=pyhd3eb1b0_0 - - python-flatbuffers=2.0=pyhd8ed1ab_0 - - python_abi=3.10=2_cp310 - - pytz=2022.1=py310hecd8cb5_0 - - pyu2f=0.1.5=pyhd8ed1ab_0 - - pyyaml=6.0=py310h1961e1f_4 - - pyzmq=23.2.0=py310he9d5cce_0 - - re2=2022.06.01=hb486fe8_0 - - readline=8.1.2=h3899abd_0 - - requests=2.28.1=pyhd8ed1ab_0 - - requests-oauthlib=1.3.1=pyhd8ed1ab_0 - - rsa=4.9=pyhd8ed1ab_0 - - scipy=1.7.3=py310h3dd3380_0 - - seaborn=0.11.2=pyhd3eb1b0_0 - - setuptools=63.2.0=py310h2ec42d9_0 - - six=1.16.0=pyh6c4a22f_0 - - snappy=1.1.9=h6e38e02_1 - - sqlite=3.39.1=hd9f0692_0 - - stack_data=0.2.0=pyhd3eb1b0_0 - - tensorboard=2.8.0=pyhd8ed1ab_1 - - tensorboard-data-server=0.6.0=py310hd6fa1ae_2 - - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0 - - tensorflow=2.8.1=cpu_py310h22f808f_0 - - tensorflow-base=2.8.1=cpu_py310h196d2ec_0 - - tensorflow-estimator=2.8.1=cpu_py310h7bb394d_0 - - termcolor=1.1.0=pyhd8ed1ab_3 - - tk=8.6.12=h5dbffcc_0 - - tornado=6.1=py310hca72f7f_0 - - tqdm=4.64.0=pyhd8ed1ab_0 - - traitlets=5.1.1=pyhd3eb1b0_0 - - typing-extensions=4.3.0=hd8ed1ab_0 - - typing_extensions=4.3.0=pyha770c72_0 - - tzdata=2022a=h191b570_0 - - urllib3=1.26.10=pyhd8ed1ab_0 - - wcwidth=0.2.5=pyhd3eb1b0_0 - - werkzeug=2.1.2=pyhd8ed1ab_1 - - wheel=0.37.1=pyhd8ed1ab_0 - - wrapt=1.14.1=py310h6c45266_0 - - xz=5.2.5=haf1e3a3_1 - - yaml=0.2.5=h0d85af4_2 - - yarl=1.7.2=py310h1961e1f_2 - - zeromq=4.3.4=h23ab428_0 - - zipp=3.8.0=pyhd8ed1ab_0 - - zlib=1.2.12=hfe4f2af_2 - - zstd=1.5.2=hcb37349_0 - - pip: - - 
ethnicolr==0.9.1 - - pandas==1.4.3 -prefix: /Users/stisoj/opt/anaconda3/envs/cleanBib From 817337928bf8a281d794cfa9991ca249bb07dc99 Mon Sep 17 00:00:00 2001 From: Stiso Date: Fri, 12 Aug 2022 17:08:39 -0400 Subject: [PATCH 31/47] trying third env --- tests/env_js.yml | 173 ++++++++++++++++++++++++++++++++++++ tests/immaculate/env_js.yml | 15 ---- 2 files changed, 173 insertions(+), 15 deletions(-) create mode 100644 tests/env_js.yml delete mode 100644 tests/immaculate/env_js.yml diff --git a/tests/env_js.yml b/tests/env_js.yml new file mode 100644 index 0000000..26d4341 --- /dev/null +++ b/tests/env_js.yml @@ -0,0 +1,173 @@ +name: cleanBib +channels: + - conda-forge + - defaults +dependencies: + - abseil-cpp=20210324.2 + - absl-py=1.1.0 + - aiohttp=3.8.1 + - aiosignal=1.2.0 + - appnope=0.1.2 + - asttokens=2.0.5 + - astunparse=1.6.3 + - async-timeout=4.0.2 + - attrs=21.4.0 + - backcall=0.2.0 + - bibtexparser=1.3.0 + - blas=1.0 + - blinker=1.4 + - bottleneck=1.3.5 + - brotli=1.0.9 + - brotli-bin=1.0.9 + - brotlipy=0.7.0 + - bzip2=1.0.8 + - c-ares=1.18.1 + - ca-certificates=2022.07.19 + - cachetools=5.0.0 + - certifi=2022.6.15 + - cffi=1.15.1 + - charset-normalizer=2.1.0 + - click=8.1.3 + - colorama=0.4.5 + - cryptography=37.0.1 + - cycler=0.11.0 + - debugpy=1.5.1 + - decorator=5.1.1 + - entrypoints=0.4 + - executing=0.8.3 + - fonttools=4.25.0 + - freetype=2.11.0 + - frozenlist=1.3.0 + - gast=0.5.3 + - giflib=5.2.1 + - google-auth=2.9.1 + - google-auth-oauthlib=0.4.6 + - google-pasta=0.2.0 + - grpc-cpp=1.45.2 + - grpcio=1.45.0 + - h5py=3.6.0 + - habanero=1.2.2 + - hdf5=1.10.6 + - icu=70.1 + - idna=3.3 + - importlib-metadata=4.11.4 + - intel-openmp=2021.4.0 + - ipykernel=6.9.1 + - ipython=8.4.0 + - jedi=0.18.1 + - jpeg=9e + - jupyter_client=7.2.2 + - jupyter_core=4.10.0 + - keras=2.8.0 + - keras-preprocessing=1.1.2 + - kiwisolver=1.4.2 + - krb5=1.19.3 + - latexcodec=2.0.1 + - lcms2=2.12 + - libbrotlicommon=1.0.9 + - libbrotlidec=1.0.9 + - libbrotlienc=1.0.9 + - 
libcurl=7.83.1 + - libcxx=14.0.6 + - libedit=3.1.20191231 + - libev=4.33 + - libffi=3.4.2 + - libgfortran=3.0.1 + - libnghttp2=1.47.0 + - libpng=1.6.37 + - libprotobuf=3.20.1 + - libsodium=1.0.18 + - libssh2=1.10.0 + - libtiff=4.2.0 + - libwebp=1.2.2 + - libwebp-base=1.2.2 + - libzlib=1.2.12 + - lz4-c=1.9.3 + - markdown=3.4.1 + - matplotlib=3.5.1 + - matplotlib-base=3.5.1 + - matplotlib-inline=0.1.2 + - mkl=2021.4.0 + - mkl-service=2.4.0 + - mkl_fft=1.3.1 + - mkl_random=1.2.2 + - multidict=6.0.2 + - munkres=1.1.4 + - ncurses=6.3 + - nest-asyncio=1.5.5 + - numexpr=2.8.3 + - numpy=1.22.3 + - numpy-base=1.22.3 + - oauthlib=3.2.0 + - openssl=1.1.1q + - opt_einsum=3.3.0 + - packaging=21.3 + - parso=0.8.3 + - pexpect=4.8.0 + - pickleshare=0.7.5 + - pillow=9.2.0 + - pip=22.1.2 + - prompt-toolkit=3.0.20 + - protobuf=3.20.1 + - ptyprocess=0.7.0 + - pure_eval=0.2.2 + - pyasn1=0.4.8 + - pyasn1-modules=0.2.7 + - pybtex=0.24.0 + - pycparser=2.21 + - pygments=2.11.2 + - pyjwt=2.4.0 + - pylatexenc=2.10 + - pyopenssl=22.0.0 + - pyparsing=3.0.9 + - pysocks=1.7.1 + - python=3.10.5 + - python-dateutil=2.8.2 + - python-flatbuffers=2.0 + - python_abi=3.10 + - pytz=2022.1 + - pyu2f=0.1.5 + - pyyaml=6.0 + - pyzmq=23.2.0 + - re2=2022.06.01 + - readline=8.1.2 + - requests=2.28.1 + - requests-oauthlib=1.3.1 + - rsa=4.9 + - scipy=1.7.3 + - seaborn=0.11.2 + - setuptools=63.2.0 + - six=1.16.0 + - snappy=1.1.9 + - sqlite=3.39.1 + - stack_data=0.2.0 + - tensorboard=2.8.0 + - tensorboard-data-server=0.6.0 + - tensorboard-plugin-wit=1.8.1 + - tensorflow=2.8.1 + - tensorflow-base=2.8.1 + - tensorflow-estimator=2.8.1 + - termcolor=1.1.0 + - tk=8.6.12 + - tornado=6.1 + - tqdm=4.64.0 + - traitlets=5.1.1 + - typing-extensions=4.3.0 + - typing_extensions=4.3.0 + - tzdata=2022a + - urllib3=1.26.10 + - wcwidth=0.2.5 + - werkzeug=2.1.2 + - wheel=0.37.1 + - wrapt=1.14.1 + - xz=5.2.5 + - yaml=0.2.5 + - yarl=1.7.2 + - zeromq=4.3.4 + - zipp=3.8.0 + - zlib=1.2.12 + - zstd=1.5.2 + - pip: + - ethnicolr==0.9.1 + - 
pandas==1.4.3 +prefix: /Users/stisoj/opt/anaconda3/envs/cleanBib diff --git a/tests/immaculate/env_js.yml b/tests/immaculate/env_js.yml deleted file mode 100644 index 6c49a40..0000000 --- a/tests/immaculate/env_js.yml +++ /dev/null @@ -1,15 +0,0 @@ -name: cleanBib -channels: - - defaults -dependencies: - - pip - - python - - habanero - - pylatexenc - - pybtex - - bibtexparser - - numpy - - tensorflow=2.8 - - ipykernel - - seaborn -prefix: /Users/stisoj/opt/anaconda3/envs/cleanBib From cf988f57ca2b0066196d08bd0837279a73a1a31e Mon Sep 17 00:00:00 2001 From: Stiso Date: Fri, 12 Aug 2022 17:31:21 -0400 Subject: [PATCH 32/47] working on query reporting --- cleanBib.ipynb | 279 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 263 insertions(+), 16 deletions(-) diff --git a/cleanBib.ipynb b/cleanBib.ipynb index 1133d54..60d6ac3 100644 --- a/cleanBib.ipynb +++ b/cleanBib.ipynb @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": { "kernel": "Python 3" }, @@ -70,7 +70,7 @@ "import seaborn as sns\n", "\n", "cr = Crossref()\n", - "homedir = '/home/jovyan/'\n", + "homedir = '/Users/stisoj/Documents/dev/cleanBib/tests/immaculate/'\n", "bib_files = glob.glob(homedir + '*.bib')\n", "paper_aux_file = glob.glob(homedir + '*.aux')\n", "paper_bib_file = 'library_paper.bib'\n", @@ -117,14 +117,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": { "kernel": "Python 3" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1: buzsaki2013memory\n", + "2: Lundine2019\t\t <-- ***NAME MISSING OR POSSIBLY INCOMPLETE***\n", + "3: zurn2020network\n", + "4: moralia2005\n", + "5: bassett2022curious\n", + "6: fake2022 <-- self-citation\n", + "7: jurafsky2018n\t\t <-- ***NAME MISSING OR POSSIBLY INCOMPLETE***\n", + "8: mitchell2013gendered\n", + "9: chatterjee2021gender\n", + "10: fulvio2021imbalance\n", + "11: ethnicolr2022black\n", + "12: 
ethnicolr2022hispanic\n", + "13: ethnicolr2022asian\n", + "14: ethnicolr2022white\n" + ] + } + ], "source": [ - "yourFirstAuthor = 'LastName, FirstName OptionalMiddleInitial'\n", - "yourLastAuthor = 'LastName, FirstName OptionalMiddleInitial'\n", + "yourFirstAuthor = 'Stiso, Jennifer'\n", + "yourLastAuthor = 'Zhou, Dale'\n", "optionalEqualContributors = ['LastName, FirstName OptionalMiddleInitial', 'LastName, FirstName OptionalMiddleInitial']\n", "checkingPublishedArticle = False\n", "\n", @@ -143,11 +164,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "kernel": "R" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "STOP: Please remove self-citations. Then, re-run step 2. Here are some suggestions to check for with the following citation keys in your .bib file: \n", + "['fake2022']\n", + "STOP: Please revise incomplete full first names or empty cells. Then, re-run step 2. Here are some suggestions to check for with the following citation keys in your .bib file: \n", + "['Lundine2019', 'jurafsky2018n']\n", + "Only continue if you've run steps 2, and this code no longer returns errors.\n" + ] + } + ], "source": [ "bib_check(homedir)" ] @@ -186,27 +219,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": { "kernel": "R" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Remaining credits: 262\n", + "This should use (at most) 25 credits, saving you approx 1 credit(s) by storing queries.\n" + ] + } + ], "source": [ - "genderAPI_key = '&key=YOUR ACCOUNT KEY HERE'\n", + "genderAPI_key = '&key='\n", "\n", "# The following saves the api key to a txt file just to be reloaded by the next cell\n", "with open(\"genderAPIkey.txt\", 'w') as f:\n", " f.write(genderAPI_key)\n", "\n", "# Check your credit balance\n", + "authors_full_list = pd.read_csv(homedir + 'cleanedBib.csv')\n", "url = 
\"https://gender-api.com/get-stats?key=\" + genderAPI_key\n", "response = urlopen(url)\n", "decoded = response.read().decode('utf-8')\n", "decoded_json = json.loads(decoded)\n", "print('Remaining credits: %s'%decoded_json[\"remaining_requests\"])\n", - "print('This should use (at most) %d credits, '%len(np.unique(authors_full_list)) + \\\n", - " 'saving you approx %d'%(len(authors_full_list)-len(np.unique(authors_full_list))) + \\\n", - " ' credits if results are stored.')" + "print('This should use (at most) %d credits, '%(authors_full_list.FA.nunique() + authors_full_list.LA.nunique()) + \\\n", + " 'saving you approx %d'%((authors_full_list.FA.count() + authors_full_list.LA.count())-\n", + " (authors_full_list.FA.nunique() + authors_full_list.LA.nunique())) + \\\n", + " ' credit(s) by storing queries.')" ] }, { @@ -224,11 +268,214 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": { "kernel": "Python 3" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "first author is Jennifer Stiso \n", + "last author is Dale Zhou \n", + "we don't count these, but check the predictions file to ensure your names did not slip through!\n", + "looping through your references, predicting gender and race\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/23 [00:00 Date: Sun, 27 Nov 2022 23:54:11 -0500 Subject: [PATCH 33/47] relax dependencies --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 1b88742..ba8ebce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -72,7 +72,7 @@ pexpect==4.8.0 pickleshare==0.7.5 Pillow==9.1.0 prometheus-client==0.14.1 -prompt-toolkit==3.0.29 +prompt-toolkit protobuf==3.20.1 psutil==5.9.0 ptyprocess==0.7.0 From bd523551ccdde6a597748989be75efddb12fa226 Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Mon, 28 Nov 2022 00:11:12 -0500 Subject: [PATCH 
34/47] relax dependencies --- requirements.txt | 200 +++++++++++++++++++++++------------------------ 1 file changed, 100 insertions(+), 100 deletions(-) diff --git a/requirements.txt b/requirements.txt index ba8ebce..247836d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,121 +1,121 @@ -absl-py==0.15.0 -appnope==0.1.3 -argon2-cffi==21.3.0 -argon2-cffi-bindings==21.2.0 -asttokens==2.0.5 -astunparse==1.6.3 -attrs==21.4.0 -backcall==0.2.0 -beautifulsoup4==4.11.1 +absl-py +appnope +argon2-cffi +argon2-cffi-bindings +asttokens +astunparse +attrs +backcall +beautifulsoup4 bibtexparser==1.2.0 -bleach==5.0.0 -cachetools==4.2.4 -certifi==2021.10.8 -cffi==1.15.0 -charset-normalizer==2.0.12 -cycler==0.11.0 -debugpy==1.6.0 -decorator==4.4.2 -defusedxml==0.7.1 -entrypoints==0.4 +bleach +cachetools +certifi +cffi +charset-normalizer +cycler +debugpy +decorator +defusedxml +entrypoints ethnicolr==0.8.1 -executing==0.8.3 -fastjsonschema==2.15.3 -flatbuffers==1.12 -fonttools==4.33.3 -folium==0.2.1 -future==0.18.2 -gast==0.4.0 -google-auth==1.35.0 -google-auth-oauthlib==0.4.6 -google-pasta==0.2.0 -grpcio==1.34.1 +executing +fastjsonschema +flatbuffers +fonttools +folium +future +gast +google-auth +google-auth-oauthlib +google-pasta +grpcio h5py==3.1.0 habanero==1.2.0 -idna==3.3 -imgaug==0.2.6 -importlib-metadata==4.11.3 -ipykernel==4.10 -ipython==5.5.0 -jedi==0.18.1 -Jinja2==2.11.3 -jsonschema==4.4.0 -jupyter==1.0.0 -jupyter-client==7.3.0 -jupyter-console==6.4.3 -jupyter-core==4.10.0 -jupyterlab-pygments==0.2.2 -jupyterlab-widgets==1.1.0 +idna +imgaug +importlib-metadata +ipykernel +ipython +jedi +Jinja2 +jsonschema +jupyter +jupyter-client +jupyter-console +jupyter-core +jupyterlab-pygments +jupyterlab-widgets keras==2.8.0 keras-nightly==2.5.0.dev2021032900 Keras-Preprocessing==1.1.2 -kiwisolver==1.4.2 +kiwisolver latexcodec==2.0.1 -Markdown==3.3.6 -MarkupSafe==2.1.1 -matplotlib==3.5.1 -matplotlib-inline==0.1.3 -mistune==0.8.4 -nbclient==0.6.0 -nbconvert==6.5.0 
-nbformat==5.3.0 -nest-asyncio==1.5.5 -notebook==5.3.0 +Markdown +MarkupSafe +matplotlib +matplotlib-inline +mistune +nbclient +nbconvert +nbformat +nest-asyncio +notebook numpy==1.21 -oauthlib==3.2.0 -opt-einsum==3.3.0 -packaging==21.3 +oauthlib +opt-einsum +packaging pandas==1.3.5 -pandocfilters==1.5.0 -parso==0.8.3 -pexpect==4.8.0 -pickleshare==0.7.5 -Pillow==9.1.0 -prometheus-client==0.14.1 +pandocfilters +parso +pexpect +pickleshare +Pillow +prometheus-client prompt-toolkit protobuf==3.20.1 -psutil==5.9.0 -ptyprocess==0.7.0 -pure-eval==0.2.2 -pyasn1==0.4.8 -pyasn1-modules==0.2.8 +psutil +ptyprocess +pure-eval +pyasn1 +pyasn1-modules pybtex==0.24.0 -pycparser==2.21 -Pygments==2.12.0 +pycparser +Pygments pylatexenc==2.10 -pyparsing==3.0.8 -pyrsistent==0.18.1 -python-dateutil==2.8.2 -pytz==2022.1 -PyYAML==6.0 -pyzmq==22.3.0 -qtconsole==5.3.0 -QtPy==2.1.0 -requests==2.23.0 -requests-oauthlib==1.3.1 -rsa==4.8 +pyparsing +pyrsistent +python-dateutil +pytz +PyYAML +pyzmq +qtconsole +QtPy +requests +requests-oauthlib +rsa scipy==1.7.3 seaborn==0.11.2 -Send2Trash==1.8.0 -six==1.15.0 -soupsieve==2.3.2.post1 -stack-data==0.2.0 +Send2Trash +six +soupsieve +stack-data tensorboard==2.9.0 tensorboard-data-server==0.6.1 tensorboard-plugin-wit==1.8.1 tensorflow==2.5.2 tensorflow-estimator==2.5.0 -termcolor==1.1.0 -terminado==0.13.3 -tinycss2==1.1.1 -tornado==5.1.0 -tqdm==4.64.0 -traitlets==5.1.1 -typing-extensions==3.7.4.3 -urllib3==1.26.9 -wcwidth==0.2.5 -webencodings==0.5.1 -Werkzeug==1.0.1 -widgetsnbextension==3.6.0 -wrapt==1.12.1 -zipp==3.8.0 +termcolor +terminado +tinycss2 +tornado +tqdm +traitlets +typing-extensions +urllib3 +wcwidth +webencodings +Werkzeug +widgetsnbextension +wrapt +zipp From 28fc386f6fb9382d934845cc0f0539959700e736 Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Mon, 28 Nov 2022 00:18:39 -0500 Subject: [PATCH 35/47] relax dependencies --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt 
b/requirements.txt index 247836d..17b5c71 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,7 +30,7 @@ google-auth google-auth-oauthlib google-pasta grpcio -h5py==3.1.0 +h5py habanero==1.2.0 idna imgaug @@ -61,7 +61,7 @@ nbconvert nbformat nest-asyncio notebook -numpy==1.21 +numpy==1.19.2 oauthlib opt-einsum packaging From 8561e75a791e349a94373d90b8c38572b0d0c5ae Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Mon, 28 Nov 2022 00:49:34 -0500 Subject: [PATCH 36/47] upgrade ethnicolr --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 17b5c71..6b74e1b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,7 +18,7 @@ debugpy decorator defusedxml entrypoints -ethnicolr==0.8.1 +ethnicolr==0.9.1 executing fastjsonschema flatbuffers From dc1950fb9aa0efccd9a18bd5c94bede8050989d3 Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Mon, 28 Nov 2022 00:56:53 -0500 Subject: [PATCH 37/47] update versions --- requirements.txt | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6b74e1b..d4842cd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ astunparse attrs backcall beautifulsoup4 -bibtexparser==1.2.0 +bibtexparser==1.3.0 bleach cachetools certifi @@ -31,7 +31,7 @@ google-auth-oauthlib google-pasta grpcio h5py -habanero==1.2.0 +habanero==1.2.2 idna imgaug importlib-metadata @@ -47,8 +47,7 @@ jupyter-core jupyterlab-pygments jupyterlab-widgets keras==2.8.0 -keras-nightly==2.5.0.dev2021032900 -Keras-Preprocessing==1.1.2 +keras-preprocessing==1.1.2 kiwisolver latexcodec==2.0.1 Markdown @@ -61,11 +60,11 @@ nbconvert nbformat nest-asyncio notebook -numpy==1.19.2 +numpy==1.22.3 oauthlib opt-einsum packaging -pandas==1.3.5 +pandas==1.4.3 pandocfilters parso pexpect @@ -100,11 +99,12 @@ Send2Trash six soupsieve stack-data -tensorboard==2.9.0 -tensorboard-data-server==0.6.1 +tensorboard==2.8.0 
+tensorboard-data-server==0.6.0 tensorboard-plugin-wit==1.8.1 -tensorflow==2.5.2 -tensorflow-estimator==2.5.0 +tensorflow==2.8.1 +tensorflow-base==2.8.1 +tensorflow-estimator==2.8.1 termcolor terminado tinycss2 From cfc9643080eafa406d65eec678f7933cb69dae7d Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Mon, 28 Nov 2022 00:58:07 -0500 Subject: [PATCH 38/47] update python --- environment.yml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 environment.yml diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..9794d26 --- /dev/null +++ b/environment.yml @@ -0,0 +1,32 @@ +name: cleanBib +channels: + - conda-forge +dependencies: + - python=3.10.5 + - pip + - pip: + - pybtex==0.24.0 + - numpy==1.19.5 + - bibtexparser==1.2.0 + - pandas==1.4.2 + - pylatexenc==2.10 + - sos + - sos-notebook + - habanero==1.2.0 + - ethnicolr==0.8.1 + - matplotlib==3.5.1 + - seaborn==0.11.2 + - scipy==1.8.0 + - h5py==3.1.0 + - oauthlib==3.2.0 + - rsa==4.8 + - Keras==2.8.0 + - tensorflow==2.5.2 + - protobuf==3.20.1 + - nbgitpuller + - sphinx-gallery + - re + - tqdm + - json + - pickle + - urllib \ No newline at end of file From 7d6205c5960507465bd0d2bd2d0715d0736a7979 Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Mon, 28 Nov 2022 01:12:29 -0500 Subject: [PATCH 39/47] rm environment.yml --- environment.yml | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 environment.yml diff --git a/environment.yml b/environment.yml deleted file mode 100644 index 9794d26..0000000 --- a/environment.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: cleanBib -channels: - - conda-forge -dependencies: - - python=3.10.5 - - pip - - pip: - - pybtex==0.24.0 - - numpy==1.19.5 - - bibtexparser==1.2.0 - - pandas==1.4.2 - - pylatexenc==2.10 - - sos - - sos-notebook - - habanero==1.2.0 - - ethnicolr==0.8.1 - - matplotlib==3.5.1 - - seaborn==0.11.2 - - scipy==1.8.0 - - h5py==3.1.0 - - oauthlib==3.2.0 - - rsa==4.8 - - 
Keras==2.8.0 - - tensorflow==2.5.2 - - protobuf==3.20.1 - - nbgitpuller - - sphinx-gallery - - re - - tqdm - - json - - pickle - - urllib \ No newline at end of file From bd898d0a5cafb61b1632fb202cf7510cea7d1920 Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Mon, 28 Nov 2022 01:16:14 -0500 Subject: [PATCH 40/47] rm r --- runtime.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 runtime.txt diff --git a/runtime.txt b/runtime.txt deleted file mode 100644 index 6a40cb8..0000000 --- a/runtime.txt +++ /dev/null @@ -1 +0,0 @@ -r-3.6-2019-04-12 From e727ae54f7ce201840bb42d78d1a95117beca491 Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Mon, 28 Nov 2022 01:19:52 -0500 Subject: [PATCH 41/47] update python --- runtime.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 runtime.txt diff --git a/runtime.txt b/runtime.txt new file mode 100644 index 0000000..4fb51b0 --- /dev/null +++ b/runtime.txt @@ -0,0 +1 @@ +python-3.10.5 \ No newline at end of file From 340b50b4068a4b17534a5183bf5dc0b3dcc81aa5 Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Mon, 28 Nov 2022 01:24:15 -0500 Subject: [PATCH 42/47] rm tf-base --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d4842cd..6e56071 100644 --- a/requirements.txt +++ b/requirements.txt @@ -103,7 +103,6 @@ tensorboard==2.8.0 tensorboard-data-server==0.6.0 tensorboard-plugin-wit==1.8.1 tensorflow==2.8.1 -tensorflow-base==2.8.1 tensorflow-estimator==2.8.1 termcolor terminado From be2bfa711e0724af09f90d1d486d81b1737a87ab Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Mon, 28 Nov 2022 01:29:21 -0500 Subject: [PATCH 43/47] relax dependencies --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6e56071..3239c9a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -103,7 +103,7 @@ tensorboard==2.8.0 tensorboard-data-server==0.6.0 tensorboard-plugin-wit==1.8.1 tensorflow==2.8.1 
-tensorflow-estimator==2.8.1 +tensorflow-estimator termcolor terminado tinycss2 From 8944c1fe204505ff6509d72d1e7a214e2119b74e Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Mon, 28 Nov 2022 02:12:17 -0500 Subject: [PATCH 44/47] clean up --- diversityStatement/.DS_Store | Bin 6148 -> 0 bytes environment.yaml | 26 ------------------ .../__pycache__/preprocessing.cpython-310.pyc | Bin 14445 -> 0 bytes .../__pycache__/preprocessing.cpython-39.pyc | Bin 14879 -> 0 bytes utils/__pycache__/queries.cpython-310.pyc | Bin 10413 -> 0 bytes utils/__pycache__/queries.cpython-39.pyc | Bin 1893 -> 0 bytes 6 files changed, 26 deletions(-) delete mode 100644 diversityStatement/.DS_Store delete mode 100644 environment.yaml delete mode 100644 utils/__pycache__/preprocessing.cpython-310.pyc delete mode 100644 utils/__pycache__/preprocessing.cpython-39.pyc delete mode 100644 utils/__pycache__/queries.cpython-310.pyc delete mode 100644 utils/__pycache__/queries.cpython-39.pyc diff --git a/diversityStatement/.DS_Store b/diversityStatement/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0YQg34ZQVQqHY$=I4@aDa-LPR@HT3xdb*g7^G1pzoR=wP@J`f5 zFGuvU#hfrj@-3y97nVrjJtop(1n+T?5u|qZxx(>|Uv{qfNHQ0y3rIsO^K5kiiSuRYITGLMp^8cs<6i=A6VE9;UK@e0tf)d2 zT7cQvqb%w8)`ann)=>j>Mc*<+;-*URt%NWU(|mJN>F9xWT6uk{WBAFy*i?nSYRIPo zgD8mPs)o3}W#Jn$^c8K7a#K~6wTDGI(61`;(*Z^k=oryRyY81)9lP4F%dKnnVzuVj zi>|aW5L>vFR^4g%hi&QjE!nVZRnNEGMY~e<%YM~u*sIQlm$vcqm&?9gmP}^`^AHhQ zX!&--^@(_wR?!6ITC3d7R+gR0DkwSL48A2z5KW@z3%BLpNwoaMBTt8F`i^?!wuZd9 z_VY)6M!!NTOna?`rgSTg=Y_g#H9}+l-253ys}h>RsbF2hgx9Q9{m`IMhkDIvgl4(f zbQ+?NC|a1T5=~ZUO2=!}{9?YA}7(xolhTq z37f+^>iJdAT{(I>o@?)@aIPM$)SPkyo9JlEuhzVyP3gp)tu~fsnj7I*fAUJ0t!B&5 
zf~Gf(V6Tx;)2gXiY6gGfs`iGZ8mghmDP&e?Cc8g6yapcLh60fQn-Cjk+E;uvP_U&m zq0`pcRJX7lu?aQ2O?r2pxw2-*{6kSnYC}x6Ev_EX#GfQLa0)w&zO> z4O`G(#^$3mE<;<3ev7s8(yWawVI#LjCoduq?aNu4=d0AK7ACmN?IZ?u`-6M;?zOWA z4%_q6^_`0Eh=T`0rJXoXIBXvXbr3Bi!s&|JxavrsPxYmyQ>m6~rP%=*$HR2Ld%X5hup6C|k0Y@0cuyd( zk`q`e?VhG_c-P2UKi-yN8F#tP5I><^+v6wSz(=5LDK{177%JYHuLbI+3XY-k0zWRa zEgg%e3nStT@i%x!^sQ+fqmyVnit-6;h)<%d39b^=N^UEaLn#BLJ{zd7MfFUi5C=LN zD3=u901wiplpryMjRcNkwW~uLpjK@>cmvFKSJ+>6Y!6c!@07zf@s{YF?Gc zP-18;hwg@NBDcGiIAXMhIC#8&`yaN?1R^ zis{x}18XIZqrlKQQI}{tG4sqI;wziV7TD%Z+T&{b(D1Af%PQgVXNMRUe-t!HY&jfd zrd9G?E?7{D={Nz^wv9tRj0z<@9@Wg4{5{k^O7Agun$Ld!I;xM8EU=%z&~Y|}j(0*B z-19{H@u8kSpwC5p?(FF>QT8g;YG_uM8m@FCvD<>lgOQ&=JRE5?s&ti#vR@8UC{r)j zs%_kvPJ@yqU0MY#G^ml#piwSb*Oxo`Fc)m@f48SDS~WK%|@%f;7F=UnLR}4F_65!efwawA)IUI(830FGCJ&h zt{=jZg*wszulV}slOF0APr(QiYb{4^02Vqm+}*g?t94*96VS_da4**Z;L1x*d(?OR za_vZj#k??q&epwRVySu+uuYv|j)Gd{dbPIE)(+bRE7U47OsuIQ_|}LpG&mDfK)qf+(?I$n(brn*T|8%YL6``FO_YL=UhEOl}3tlccr z{cs|3>Fi~+^4If88J505hi!pPxQR1BK zvo!`7ElP1A79QTgm^?cFnS_NolJkg$$pvh6!0RDKLF~H666tCNOCY>EhT($=lEAeE z)=0>gge*E_K7_9>kRsL|r4|zHuMbVlvRijVRnFQJu@S>ob>#+63BgzVqW;X<?E4Wo}D&WB9-&SaGR&kQYaF5+!N!K^GlGl|*!b6F- zEUI;ymz8iVbA2FG!nza(?GbQyl!3gAFi6DG^3U(w9=CPLPYG*NeN$aa`sqM}uwbJCQFUAyNu+fF#QZk<=EoQ z6MsoG3GPfQ?MG?f^+!4LOwFy7Yu*V&y2Tkx&b1G|)F>}NWWdq#S{2d*EVgO@r4fW5 z7+w42a}AqVzunVo9JbdYB}EQ?VpeooG!5>N&^YibxAjGVLZ&lohpecZdikKIt;7BWSk-AP*9E|2(?BN zm*z6alH!8|E%^v#j!|%d0z&1XUN1N0ixkVCEx@~&VHl!SZ}QQ{_4zUtzf6VF{UzhJ zLcJ=k$@Fp^nZmn3fO*E6_0C)9nq z1pzyw8)}vXZS!tEkwJXly|iKIX9OoG;( zB!NvFYs*(xNKm$Js&cOnRz)!!+eBc|9U~I&=HB2F4_IZyFu_6{Rt1DE|1yk0SQz?-GF0#75VCus&FN3Z;!+>HpcW zbkd(7%3!>^P%=w&-%~eoJC)_N?vC^2PAb?XCRerhR9Wn;I**tyC@7%cvi!+lbW^2y z>`c%u2onD802duXi0{EDm)tvCQcUgv>%>@%C0;Lo%AXD(NQj4~z)<5W2m2EX`DL>a7jU^*B=)U;BZ-C&-}w%UWZv_UrJ)E8DENBTy^&3VXkP$zw;*}ghe3wPL>&|Bc81;%v8Lq z@{34heu}G#SQfQ$M?HK;4Q}aSN}x@MEsiXv)>@>5gl4grq$uu$TV^qNGLq6l?d-`= zJAbm6xJ)uy(dvo=#Sw^KtaAUnvoY7s6Y>?1cDnps5^!o8B>mk6qrWFlC!{ 
zSl#hux`9*C4SZFV&{Z%N5JP6crqr9{fy=mCsXf61iAV3D`!t!MS}v%PB%ny~!^#{I zorVTPgXuzw_ep=lkRlBgBvASdD7?UlFsKMt^!t5F=dtkR5~|)Um8G~SpN1q~Ly!mh ze4il2KBax&!-vwt6%Xv3+#s@jJro&yWYA0NH9C$)We4eZGzd+TsZ`2-=ExBiua97w zc=g-IUpe{W+_|~uX6-phUeT^$p?|&X*{~5@1so7P+(BwfyZjgkv*`vo(MF%K=OcJg zFE?5M3>#>rEFp%ll^6!_*D{caW^iSSV^S z(&Cdjh^#*q_ed-alX8GG4O|Z~*o>&(ya^5yHiAZh@*{e*f4aJf9b53axjWIbyJMTN zFE2Zd!{`QZQUizy0E!yHzNNki$6f$RT%9RAB=ea4P;}9NI0B|CitQjGt z(IN3a9DAv;K)~`m;a0XKzzkJ$VE!S=K1?~#yEqaEp7mf6;vyAj1i&6atgt&w#)v*b zSZ68QqrgWHCRy|bD@wEC;uHzu#Y~(y=aL>K34A|K6OhDkN$8WCs9DSn9#)hqjtwSo z)#3H|s??o|#)FD`qIv`3)Er2kqM7fc`W3>wO1-VH!}h8&H?P9;T($yfy*l$-cX$0R;+hVV2S1IAR%fAMz}u zpcF!2!ne~{s}O;X*Uq@~NRqN5d3VRU~aCXvD76dvz8;B*2gGsc%mQf5N+p9H>y<^o6) zG2ly5GWPMMfXmLWooJ_f=}?LS^<{vOG7&Z!$RRwKxdPN6a-cBU8AUA5v84G!8A5&@LNRcrF#PeAT{rcc>ed9IM_|winDO!MjB>5)N34~N&=%`{ei-*< zcPGcRavzaa4z#EaDUW{`Qei9^cP^I#bJavhBT?&#C?jU@pvfPP%Vk zES{s?JV$w~1e`7CPZ*v26@}NTOJRx-`cR26tPhYr_1}A*W@>9%Oasef#dusQdQwRd zdfpQxg0Ubk_5wYRf<_LQcrP$0&El35il)xurANCPRUXMUNd%(#EO#flpfe5jJK|UA{ z(h;)W0VDrl@T{zz5NUv>hud#|nDT{?0^eBM?lQgrW^8ZSZPlPqsCkg@p{pT%GT)3I zU#zE2Uy8oCiJn^KYy?WX7}d51QL0=8|L8+gS)A7~A>8oOFwDA&r+CeSV3a9YxZ@rSevP=LfZlu?G)K2EgaT44B*tH@Hh zNfn_2c4UK-$~Dx4i3=B_*M9YdRvGs*6^3rZxe8l)RIlHGg(|QDNC4DGx8c^I$*M(` z2B@@%h`k6cdQbI?yB$}X*PeRqG5ZvBp=68Va+lm!mT?WWU;fhPT7WI*1T-zbNFc$! 
zgyvh(e4lN>WO_1xz2#;kqPW`g!SX~;Y6E6?$S0d4gL z4>&=?2I&f>4M@#}X9@SeJBOr5@?3H|u)9fh2hI?}t+`7ZKtxVWc7LzItrt$@Y~O%S@FT06|Q`Prs5(lUK}xQi`XTP43a!e)QfP zYkz8oxsheihLVcbmVgmRJv3j;g3O9AM4pdS@?seu*3Rvg^v?W1$Rq(`D@q#jg&cReWI;_bov zk5uPa%9g~C+Rx2PsPHg8pk>epuI?=Z)#nN{X3v2xkIoVe8`2@VLZ^>*J9{g`M5+DFZD~I+KD8b)5yO(V1~Me@4|lhD^&Ll3cgH1 z59-P{>637ue2apwP{6?Umk=WZ9Bs&A5;%otG40oz7`ADg5jJz~n2<%UPb z>@Zb_vl)Mqrx*GBgd@#XqvA)WJ+j1+GO;+u1S3a_m?9axBV2uhVsBIMYY5=Y1!h^o zN;)3n;=(TZ>y-I21^vwm9ox{i9ufs_$W zu?A8y*A=8`u)Fl@jfynXpt=}=^}ogT`-5+5*EP_}43x@o3cyaU4|#am5S&W`&aD8h zWT8CEA}t@M`Q5a&^MMrr2nQp89-~WUXUx|`4wRHg9SQ)Eg))lhd|%`{<38+FpfR?p z$St_b8G(tfarguzQEFGLtQ#kTqsejceh2k896ePws|%WlsjwPyhOg{k&G^+8G` zmh-J08Hvcc+Cv`Er8;Ejj4H%FEa1ZM-y?m%*}kp(zP!H0d?@RIM{%gs*^@(`=ewV= zj*drir$9{*V?JziM<{AvjE{Qc>2r>ldAZh}jrAUps)1D*q$cnxkjMQ8Dl!-v_W%3Dm09_;mn`YD&OVuu;khf6P4>N8f#36{wuF-aKO4AJ-W*a+6?91^G5IP5 zWRtmNvR@c8Pxkf;bIc4$xe~L&uTfcYTeubJ=H!t7rp)?n9Lx7fvyoLau!l5Q z1sI8*`AK_!pKT5(=XTa;4USlDQDeH?hgZMX@+N(Ln*x%&xAo$i=!N_Pq~AA>W`f!=-a~+1l~v! 
zq-8$1udCf9=Lz6u`-8^hKTi$xl%uRk{4UWZ+jmwWw_M|+)0KaGKh^L3pOY!c1{U@3 zyA+G3z@zvM-TZSRc#AR-p}G2fic|1W=ipJQI6Mb`L&X2&qtcS!yPwkEb;tc+3-cw^ z@+J`QPBLjHk>BKi8n^(x3G%y2?F740M_nH7_dlXFHY`s-^?ijsNaZi1X8SZ$v-j&} zImJG7=+NBB3uo*L=Pq4>p4PtjqCI~R(Pv3}I(O>gh57SmE}ucE_O2ahgz8-Tc$fN5 zPYW&_kDy~bE;uZ=>V3r^pb+t=J_?fGpikZid>|O~Fv`hxAq%^C9V_OBzsuVb7K+0) zj&Cb)e0z=K3&j-i(n#kxT=^5*Du1H9jk+g)k2+3EgyHu4&l`@nG+{ip2g~0@>^iO9 zc|@#HU>uygac$h3NQ@`OjfsQ-Um>KZ;|6&Rk!nwS1B?;pb1yJchS8C6&ls8hZ^JU~ zWPqRyW8yBnR*3S@ks!uu;W>rJdlqeye-#^mp|raRjt^t6>Kj`Lcw53SL8jwCgV8Jr z&l7#i3=C4F2Gd5OV}e2w$VL+ismVa;De_@?&Fom6 zR43gT>13c*OM-HWD03{$3PPBhfjRV5#rq=u$4I&xhfxkVW=eq~uM#9s4~Ahf?6JRa zEik(&YfqyVp%(IU`Lal|nb~4|WRb!IDoBg+@lLWClFI#-NbOh-W=g1|(qx<#BcPE( zUEnUH!KbB=nTs?R2QjD7u_NI{exCx&u+sk2rOOxRXS+s>*kyxuYtK*uy9ES#X#J&g z1pmDQPFGITK0af&zwkVK7a$ZuA9BoeK&fNB6&!V zXvX-V=M@5@%}d+*b+rmh3ORMu;m+YhvW1JufN=xb6=^&5CJB=8276$iAHPWM(mne{ zb&*;=0NJ!VGS5DKxP6diOLiq;t471MYi?r+nwtnMdA39UH^G}Jq}Z4-Ao?AmnSB!h zIdznw>|w)2->IXJkK`#5i=rd!m0%huT8xZJVgdh$#$AVtS#PKeQ&?D{$tsTbkVZdk zEH(<2XjV#uI*KN!l&YebiR8$rKyid|PxKM0rDCc_teAvTNC_($ebVc*vJTdxJzRa_1+p zts67=f85Bw014bMFVJhhB)5^`WOIXL-)oTXXE<`4{GB5=>0dyB&{(Ru3sj6eWoF2i zZvnnPGt~zEUk^LsMN35sRGdJMd;T;tu}2^~@X1Bw%ip0wFfBXefK*JtT-4l<3zS3u z|A*0J1eA``=ff1(6cDFm-%0k)VJ8^2OR&txY>$UFFoC_xGPItlyP{QdPSD|G#~TZ1 iho@&CGTnRn)$=Y8f?X?QrJ;P-w1+l&7>uPFaS5B)zK56|K9 z{tx@%RnrrDYrrRlcL?+n|(U9To;rftUWl6I2crR)^$Mm1f_*qQj5vEAKtJryw5^3B=MMezaJ|?na825c5Cq{4|7kM#?`-B)1`*7bc z#>E8glj5^tQXF_!wGW6XaS$m};t_EO_k-e5aTxbU#Ak$s`yug|cpUdfg?dFPJaHHO zROSn6IC8P%JJ;u3;drO6I@f(9nTwT0q#>4lzOsnKg_86fiD&g#S*3#UFM+#>>ntws zml62Nswz~W1sI(vWm(6wCX5fXwi>9b`i>zIw^fSoB!r2W=9}9}TMx8z%3CvS!%qgr zwkq^BL(T>U)gY2<8shp+3eOm!U)83R+p4Op&x%Z-ZzysjKyL#5iXzhJ)ljSEm)0Ds zQnyOY>()}G>R3yzw9pw#xaDTesr%EGbo{2QTh)r^TkeuouJ|Rt;?}J-XUof2`1va( z-zrJ2X9i;tRkYajt-9+|<(;~06E#*R1+>zFBX2jwqI-ENvBKY11AXNLi}XOVv=l zen*iL_{zOsIPoebh?vhL}1t zzT&Haf=Q(bohHw=x`PRcX{h0D(!H(KPa!u=v^RbIC&+F1#ye`DVY;PSr3_D)*+xm0 
zYF52eb7n2im*^NKpud8tM`@ggCKmmcs-@*w3lqY^+bXTSsETM-&RRTL#cr`M!Flc` z(W$#XzJLFID|cktT9B^qlzm4WJrXLd#F4_ZbtKeLX(16FD7*CyNBX?3uQZ%;rBpq; zf)yOp+OA~Gri`qfIdfMDy$YeJhurINdH9Y!t#xxO5zZ)#jF zixip=?+3~i#W$dPv`HmMOk(=5*ee5KuU6_07yE2SuKtQ+c^JreeoR}WPc|$53e=5b z)ha?jS$e*S7GzBvhoS@(-3P_vw)>Jqe&7MdgK1t*vGDNB|mdGx7oWlD7c~E0$ z+OTTOBH}CC$_|v`Z5j`?b!>1vNFfz*`LhFRi607@B&8b;vq~!ZE@v#Lb|y|hv0Xio zkD@>km&fYbZXbc;%OBEaTc5vh zE=-iXa-|ZQmF2oC9ZAZxU~*^V=MWEvn)M28lcMC8!Zh;KO4UjW`=L{(WJw!RK?@D~ zNN7;5-TkzXV+EH-eNoJ;DlOaxHmMZ7z{NfNXG8>6gV2^quR%Ah>2iVaQ?w?QG3$jH zC>6~04~#bHpcFJ%8kY)uv=BdM=oyzWML0?aj(WeH2ohY|RosWB6l`?Bk{CJtHs@`d zNY6bWJqb;h6eG}dCTTjV&o}(Uwz{Es|G;sRp8B4XeR0#r6FtkTv|@h_T6-Na%&~my zF9#$g+{&zV?(!UguSRnbh-AePz*~L=n@cq7qW#pb_*H0D(vKXa0$nvakiVm3J*T>K z7AWj2!Lg`hy;)myBo(EXU9{&hNZk8<>u9Aeoa<=N;ubYBTIRf>AA!h19ch40yt(rl z4|VjXV1$YFrX#li`J5_tW^CJ)8t|40Smis|b!z};rDdlz?7M!cdOX5eUYI~*Yo480 zu5193sWYrgP^wg`RJU5%v{gujT3LpPO;@gY@-t{cq*6%lh9g^UQ(hpAekN*=MX5PT zy$FRLB^FZh3zVGTF|>yV#>h@aUGd}#lt-;Zy689_cFpi`f;K1bf^=(dNN4FvJksmP z>5(Q=l7?n#DfOV5SI2QrsR!_v$DeuM#7?2zH|}d%?qf~QsFJkU9xXsS2d@oMogLt| zt*SAEgEqkYh3$Yfb6eden+&+WV6;vS%&tbM;mBeK(au_p5^b?A83Y7zS)Q%2R~g^- zX2Y!2(!Owd9%v`g0DSWqVup#b+pPOo__@Ja?U5J_SL#K6vwN&Y0gy#8&ctTH2k4VW zt6xcEF*b4mu`s!ai4NdApeA&@=_1Fil>V<-4=gMxa=GKc5X7=m@ zk%BJP-MZ(O>Y^k?(cfw~ULhfAOzaFdF)~&R7xs&!;R?-#$U@b_6bYGTYG_8i2~(YK z?-o*a+3|~=m%oW>Jmv^en7KOejV515+JIx91j4(*VcJS(hzsKLo`Du~bR^L(AbABe!U<8?K#w}Go=?{st$n0eOVU*Uk zv6|4%A=3FOO}3=S(s#x1q|!G1Twsdaw)V9{?W8}llMj?2c^hl*b=c+x=IOSsD)q6G z$}QE)Kc%SrcMYLGigF{{T3e}S5Lcr(VKM9nn>-EJRPT)cv;}cnkccIkuvE*yA(1g2 z+g6`6d%GCBQyJrGn3B8eY9)`%ojWpX9l56-VdcF?2hk+kN75zGyA165R-UX7p-d?^ zl{M|_+PZ-rsJ1qDj;9lJgEsS&97pauYN&%kh-MjCL}BJxvbHhB8>jyi&;Nl-X+2J} zuGhzzXRB_xRP|0H(#gyKa=vx+)p}_WMgbO<*DRBDztpS(h(;j2uWzl_=IRzHeXGk> zOk0~oeKawqG_J8p(JNzt36T}Yi|7$-uuE>W>Tc3nkKX|Q@n+Oo&RBD_?_!zLKL8?0 zs$<$}z^;cLfKu`4>%EUti)ll%fU2-7S`~phD@zqH1pK{Hwc2@woy3shB@rldW$FcxOxMioT`d#gcA4MXWL8NZMq^%OaVAk^v&Y?mvjmgo#JrQ~Du 
za+HEg6c7>*^;)SRU!hnQ-vY3U)rBFNwFWPJY@DxA_N$aB(-RqgE7U9Ey8I$u*~z-I zNqWQ{iYj956iox0%JW>v4T}BSc;OND$F`~Fuu($c=!Z4h8picQYDUXp%gkZJ)IK&7 zc`zT-dRD{6xrP#DQa_y^RX&7^Rnp%ggi6wJLH)E7xJ-i6?IZzAEa#N3u96X)x~%|U)=7;sf3h{`72 zzXaLtyCh;D4xw^B>d&{66T>hIw4D(#f?7v~v6GJo(_l1Gsz@_~(G936`BX59FQ69s zLVGkALY<@0C=j_CkHWFOQ5Xk-$^8#Tm-qHQQho#dhw4)SfX(&qt$!pY+GD}cIehu8 z>+dP8F(^arCVD3JZyRm>b*28eq4;CB)UCgdl&8N>07k3IHI!KY@S0NBp6V|(iBd+l zl(v=ngi<+woW2Alysxjs?88g&Jm`$TciZV;pO{$FZfWud-5J4S!P^!H=XX>7L@>Oq z(ipeLX@&#|e}91O40r9Ly;url3wS0>(1S(x3DX1TZYbBa=|F@iv6HixEuJ z5AfTu-dk}f%7G22Hv>rqn)h-491O;|28`_bhyGNMMGX_!{&e|Q{^T|`s341(x6z&$ z=<|yYlsZgKVFVKoZgDX2koM@_R|nb?_vn_sJ{`5d2Sx86h}yPITkv733EU#ciX}pD zpJ`p}^Q0&RNf510z*N-?*nv-&2D-dYBgYCJY%z+UfNsk&a*z?IcC9SKKk}cesH71#QpZra2m!?EfrKLsWLVenbILMYM?9B zjV{C>o|GZRIA}+p_M1we!F)CXHG&zvHs8^?AH3a!s&_|ai!ORk-H~r1$m6?FM~-}Q zN4hxU(`cG((f0!ix}(d@8S6CXo(AJ;BArfK&mKR{M)+|IAkT=_sn^fEGCw!}!mKq9 z3oV*SObu+7JPW4;8-Na?i&2Qgv`U{v#e8^!H__yuu@)jIQY+P)02f>MN=d?C;j^C= z2OuS`+ncfGE?8&g&smq}FJ7ZK!zQ$ckPp{(1Q>-ZYQ{~`7b7O04ut6Zr|ce&ZDyhn zh>zfMh~dDI#U_9^(UgzzD>a#k1o3XC0IQ3~X1DL7~bOo?Q zAHi&ezUTBaJ3^MCF5T+*`5*8yLohGiiDQSI&!!N&j`Re)N zrMdaJOJ^>IsVLdql#7&~h~@}ir2&*cCE&!2=?H)x%?JFA~P( z^8_43RUI4v5d#pW9k4Tw1lDJ^EJ9~Qkw#$cF~kb{!(@#8BQ$n_Ui%a@5rj!L(V?C) zOmCbegWb-?iSsV8HAz?k3p4;pbeGIS`3_3jBmK)5`RvjD1TJb1VK#LZdYB-rxn-wo zTd}>0?~pQ!Gcc)ZXX2SQ->mgfE!?UREXz3W9C1vc0jj7LEgN1yteN&bVDbI{?e%^g zfnplKh4hCLF`&ha2C_w+#`zfiWq>4!m>H|lOS!UIe=HZL z<+;Ru;ha!~J_lD47 zHo{{;R*ayQ;r1|Md5-1$oIgTXNQ{axGM_;KfZ9a$YjP~^D+;~c%1h0IOQP`>~HiZjCMXo#U!^*Et7=h2Xc+ZvVvtm z{qS~Z)OHSt13>bvNc89T>ettS{0Z{|C!WTo_=f@L1sGziz2o#c;!5TIYyS-^RVw8vT_2`f0Sk?LP$2q2V+4dLf`#3dOZM+*1bRJ zUrXzO$N*R!YQ6VKqzz#T-oJM3Y`h)JSl)`;tb&TDdSC{?@el{i`=rO)?Adcyq9^`D zm(IBm!PX9nwXA*|E342iJ@6`P^ypI1pn!O3=^>aeKb+-lvtQwt8fC{5ZtR*n- zUBVhWJvKV8HTmXet+U`z>HLcGU2$Ju!KT!D?d_ME05Nj{9L+BhWU#K_^UY|yFErr+ z!(e2A&R?RE)_HI|Wp2d6N~P+0ZewL@);eQxh1LS`>h0FG7kI#cy}F$TxS(#M>N186 
zxXp#93Aev9hD1SmE;%Atourxr_Xy!u-Q_LdBd037*2$=9>l_QG;%{-29dg=8b?ucB*GeBig&2gNT#{TTOUwhE#Y4=;@sVtQ`qrFm{rm&QMwU=F z=q-{h!8GnGO*~@0yHabNj{7bu9^e)A9r-&W%!OAf%PYsxy9O$>Rzdc`1E&jXThGVk zk5+1hzM0&q0a&`P9c6d94hdl@_RsXziFqK@|y&{-GU)Syn(_QR(6aw_VgZc@v_9NB2n#zr<;5C6xO?5vw z&5ZgaISSFGbz=tM)^RYPoX+o<5q(gqu7N*|Yrs7PG)y>`>d&Z;_i!#S?7)+@pHVH` zl+^wAnP5meDohSD%L$oe`LL9)l}E5;^xp3>mn*-EYR@ zH_s7EI#A2aJ$d)XdB=CZA+|Ia14fx_{ap{EL^e8a+R@n(9m#fqN3^F7oR3BYVz(D4 zVL0>=IdGwOyuRmM?~r}q)kBx!K&}gC1|BbV9%B-ZdvdQppCG+FZE-^=YF&<(dgK-~ zN4mUJZOz6cNW?oZuR)xFSB7!!Ka|yAfHpt`X_)V{4ru;+q?DPXdYX>kNyqCu#Bx3g|HBj>#@#tUTEfFdSiJNK8wt3V(z0 zlBdI+h`y6=Q(R>|w~J)@kkky(962m_sFM5eGJwaScE5GFr#AaIya!WXpK6c6)g7V^ zbbBm+j~;JR@J$N3>XK=)Z#=9Adt^dRODvP85n-83vrKSW&@Ypl3d;gIYPT%pw@~m- zV%PY~9i&Ek6SWbPX_FFYQ`yzjZw)q8qrujUkG1=In)<0tA8vAQ05J((+}p12P`k27 z&w-oBfVcQRZIk?iy)F9g;7IkR<(`rHbc8B@pV|eT{ z_O$YUQJ#;=CIQL&2n8+@IK{ZPHeZosmQdRynfsPCoJImJ45?AZL7OXsaib62i_Ubim4Vl7-o z^m!t^=g(fgv~cnK)$_>J+V_AuLUq1%s#E(yels0CLCB_qD28D!;Q!2&j zduH2kUwpdooV* zRo~c2z=akkOynC7XgK%G!Kp~!F$06x*YzjC0VxLl*qMh;^Iz&`fR-_bmo=PGCE6xx zNCJCmLLrV5HLZ|)(wk;G)lRoF?V)xSoM;j?r>SO+WticGKLHS-zNUD8j{oM9>BMoa z3ZyxyfQkL zPO(Q0&D8GM7LJ<0y5OKjI>+&Ua^O+tG(79njMe(mi||c>=?=q~Jn_8d^0I^fI71u{ z&T?1{@X1y8KL_-DbGr6CdBBhX%`mU)^@D@9S9djRqk_{fa@wiEUB?Ii02iXZ11WG| z(sF7IGWX%@_n<~|o1eH$F7Vwmr^*t2`3M*USihaFO}CCR1Hi6Ad=gc6t*TpJ2EP`8 zTF-Lm|CD$$g)|>P^;ziTlf!m7Iqj4{$?;)R&uOQSkIY`OUZeHv6>*$nrx?H%#UlRa zle-BA!fsa?O|nf(gJq9(0bws~G(N~H(x?;(9NH#oDOQA?jm-ThgFVFXG1ZmgE*>Pc*iOoVX7DWricC$cH_0eg8Z% zgvN5!U8HQ}12scFnv3x2nyJ+B|FGD(G7{2PsIqaiwd*N06MK5HW2Rijd-)+{f^&dF zu4Q%thkK1JS*ADiKeia|N9gkuJw8f-MZrZ17APQ>OLlVNb0s#)S?h7v`i8CxqQlg) vHCHsN&S_eg9#u?=l!m82q`^v!r4FRVQ~OeB{2fZAGoz_O>Wx$}b>+VSt7f@x diff --git a/utils/__pycache__/queries.cpython-310.pyc b/utils/__pycache__/queries.cpython-310.pyc deleted file mode 100644 index eb47eaa92184637f15d4825f0927a31059d9e014..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10413 
zcmeHN&vP3`cAgmwei8&Jiljh_rbd(~5fVs>lGm2K@@8dez1hmju~$wQPbq~Opc~+j zgBh@AK#Itql9V<%MHN?a%x)FAx8%70MowFm%3&@!Y%V(FA8=LH`CiWe1VLG~iBpxz zmO)NW_v`N0uV26J{@w$;$;pC(&%fOHkNW(5Mfo)qMn5SOzK_TMJ0whDs-u{yd~0S( zzI9WV@3fhg?~Iv|@2r`X@0^*#TXXX5f>}_BN6MLK7tNx~>&|4mWR_%Jb*9WImS&ls zD&{oHvK-zQ*abGhia%A&88*pEpj>2AY#Q%brao0FGp`_*@^M9traOYOTGOcs-m1A) zn}e3#?|K1cvR&bL9qu-CYMy-ZcxUiD!s9EdqJ$}??r9>+w7%|UP)eaR$@IR?(k->0 z4po-kONqBbWmgF`mSNeJ#&Rqls_Cg%$B=d%C!nlqac-C%d5Z)9zjL zB|c5!!%Cs{4+@+5d5U4q;CF$|yiD~okRi*`{oE;s5WU%S%;{J?fdrI7!scn4(^1&P zFdga(%7H36l27ib5>wd41?5;{vuy4}J5HTQ$uJJ;EAzbvf8=xH@gM%kuv}(XBCs0{ zHv&6wxNiiWvESw5$ao|?-xs`TG(BNh-CzeV$M%CL-L!=t2&httbjR|8S6|^1DNzb6 zqfFfNm8?!N&#vgSLf{x>eD3fD7&vy@4x&ucb^Nw@3+?ndgym#RU79(rOATpaI^1`)jD zSPdSf{f=V?m3*W%geX$yStMpY7W)&=k5V>!9%bXQFBYga)o_DICwu!xMO+2t%kOML zhr-|V1Kam@Hy?YAZkxM-zsdN)Ce&%UAKLZJZeTn9Ce7aF{zm60nj{|nN5X6W2~X&E)A>S{*IX^ZN#I;(1bkyDpI(benPtXfPVmtOjfu8C`C-XP@;KD2prc%*@U zh)ke_*us~v7Y|hxJ401I$3wJA zx_3}8@S&yex_hp7=o&5VGA^nHR?RUE?O?~KTQDPG1>Bd7kGwAEsC4-O7sjU1u-u!0 zA-b*+?AX4s7VKC7b>{|3f#(@@yG8cq?^uFYB#+)SvjS@~9;-$J^z@C+o)c-J{&6K0 z>9oJ2Ovi5QIULsKSq#+dj@RbQ7LnSFCPwHn0)siGW1=}@o|s7%8Gq!wry zus569yz&AH20aZ*M$)m%N9Z|OkA2GW;E+Mf0#RBDoA`y=&j$K#I?S@-OSPYCDg8X2 z0?w<6AQP%D)PB)c`jdD{piSYK4zj!1P}|J~c}icXZ1Sb{SotFLryuk$&`GqL3ok4v zyM^P46FNC+`=7F@A+?xL9X8#caSOrZX$_lNz)box;Y>Ja^_O9hfmi>ctB1vKhH}I9 z1y;g21`YIQd)aVy7xE!Lx2?GO*xW#ayhQt48RqB5`V-Gf=zl5cUmodyDd}HE|3Q8s zm_EJ`%!G5h7uiKR|6XdxvnP=IL_MAhW{&3>{(?)#Wp)Ygh26!I6#mrX%dE`iaaLYt zD!u!237`3I$Y(y`Q~rj0$_bx^Z^&mMoI74RQI6lj-6E}YP}Q+VxYW8_1_AoS?ZF~ zpAO$*OW4V8vA1NpEYm9sN=xrAhKmF450r2rEGMICg6Oi0QI|0)@?SgZ6^wf2zcK1^ zxIEyFQRl)E=BbINVJVzjP@ZXhqkq+%39f`G)Lnd$iYeigoXu2dgjXRY+M;hc>AR0| zS@vD&UrYLylfIw~`>ur7WZ&`wCD4wI@B%Y1#yp*G$QkS^bYrlU_{0krX~(18Rpe6a z+Ft=NonXhcGT4JFpj^k^yB_9tugP_|F3&xUt+E?fiQJHXIb2BiuSotYL;lM{{>vl$ zZzgic*3|RNc&o$lR?m(XUL|hfHA-id{tcArTpyHg4(mZh{o1fTE?2@E;mvR@th97F z<6-GUy{tUb`#1VG`)mD5m_NRD+VTV^f2^fY&INocIMynKC!!5Lq-1QKvgfnp!XT>_x<@#O^uZ7D)S}LKT7J6AH?O^@Z 
z!%Vo2U6KhWq5dF-x}UZ*;xhU+doeyw_`P&enY*sdn3#|on{PogI?kR+KBHWF>~%t?QjRqfeD7T<(fI5 z%Z}yR4OvwO%-`#!SN-a$A7$eCMLDa{=wgCLAHO0P_gO_Xi?J9r8?g3yPha&{{fZ_C z*u`qR_^E{LY^b7+ZS+cTlK|ii4=d+2jb08b6cD)YT^=+iHE!}?2lCj#kzt`5rm9=d zZT}Sm_sUJNgjbaA2pe97=60fVkHguDv<{2%kAZ(bf?4tyhw8yTYe(5+lqlr@{$@My zJYXe}zQ>O+$EM94Mt%#}O-N&vuCX5#$S#xoB+RQLJgmPD+la~y&jm^kehm2j{Aq_b zY|D8B91b?{?XWU#!(%+kFfQNuKv=F1M9ian+uGx@2vE+^yjzE72PJ^(M5pUEg03_l zahX)PD5px*9YI?mn#79Kpp;td6Q(U8tRv#mHsPbe73g48s{@$NqBPV@zFQLm2bj3t zhF?|L6jzAz8WJ;Gx9j9lM1^>vYtV?PH$7+~)5aorESkbHNP%i?3t~JMZ&Sxy2Lp*3 zYl;u3sMD&6W#UWzm|1Fe9Y<=eMh=NNb*k2wFzJ#bAt!;=H%o4fI1I~a$zHqzUXj+S zM_SV|b4{xzZCU1Gvc0RL9E&)K6nW_G9z^b)P0Ds6x>RJUDPMcHp$;Fe%!mV69gUq{msPpUdI;Q zXn1WDgWtoZ`kJAra@~m_|UCIWw}0Iz?d1F?Jx(YUJFzLMdYxY zXfIU2o0zUAbajlLPUzYg9d5nkL#sNPPKZlrpehxAH{2XHX`8mY&^K*#l=jV_Qb74NQQR}Y(BYwggxC{*n?isdg z_!Mih8~`S|1L;l%#vqtu>{z~GGqkl)hcG9iO+;~x!F>Q2WC3pBMl}3xV+U{v;VF6c z0e>MvhcOV+lhDWmJ`+Y8;UGbGy5HFG9Nd)#eGt1sj0~Z%buh+2f>Yq=00C7`!aBFx zuA?(aMx!JHKn}uuQfy;wYyG{-hVlCVXaBpE|T;gi4^{B$%qv>XD#JWsT#7R+%s0*?asRfnNIx@D@y5X)3Y zbkT%dfNmScM-&XMlGt87uxu9^7y+{=#W&HZ2xv~iNnWG48B%Zy3L4JJs2>?!IZ?W@ zC>DsQA+3>*k)d7*nZ3GIw-L~lbF`pOVowK^QRk$i2Ji(fKX_TRpq6W_eWz+{-GRP8 zJOUHod7ui3vPr5n3W@XQ4X4YfJ0jLiT2~1mWNp?F$IPg-3dx}#P# zT#UALM6O7chM;W-8^t2QtwDg-g)0+p1FUOe0`YDnOpIac6~l)2W4Ywa0G&&}h~mva zv9MbHyb9j#dfj#{aRem_Lt@T^Rfl^Whi^zVJq1K{*;x}zd@0NUHjL>dfE?omyyZdC z;f{FCN>b%{Q~d3Xkfe$YacCl6W05#r;xAS(A{;49a&2o1)_M0e*7-)u{CYEc_;7V( z5E3mJD6zUR0S5NBjc(G(N;+?jwW}HrVVz^d&CwpQb#0^7DD_N}vo=ezy}@p!)qE7k z;*CFsn<$LOFhw#_@<>Kc9qB?R)_L9<&sxO5C}VM9l~Rn?TIP>n)UZFV>q}YM_H)kG z>5NSDfDJ`vy5r$%k;5Gl>&U=43D@6l%9A>vjYiiHHW=#;ZwRZ6FG9M+HWtn9$>WBY zC1>w|Adhi72?fN~Z)|e|4*wn<<%gD#o3`0S3@J5HJKX7DW0S_}91azI zw}T7C56Db+yhEz+yPb|F0)y{&aq89aUCrq&iEW--t{bNIBIWcoO?a zOzq(|s1NaeX#EVB>UZ(RenxlwpW$0fgl`<@?wzfB@xjDdmBIQzxIZK`EPV4QKzet- zYXOeXWpaRH67bk|ZENqechz`IYi6VW-8&n1wl?nE+qnPU+xOnRPlu`qJOFJEbz8BW z;K;Mw7L|u%0UVI4EOt3RrQ@3KItRAwX}9b6Vhi}F|G$nmzJ(jFg&VP{eC@*acDycp za);Mw(QEhKdH3$t)*Y!A&aC?@f%Rg8P<~R8H_T3 
zG2@&@*?x;-B>MV)h+`^x9H#yWp%;q%m>COSskp3C5W7U%qr{$g?2jp~J{FmxU$4QtsSO+Xzea!m1`=H@mne98`loB-l$<8|Xo@c>s(6WY z%FiJ{+--M`r0n=z43M1Viyu%uu@yh0p}(+VzZH1p=Svokw8o|(`_qlRGp z{Kuo6PbvugX2J1A09*l53&0p+_z`L&MwkO%%-k*1^dc8jFIE)HW2f*Th6?kF;FX6Qtl8|;(*Zt& z2rP#P|MeDjD)(}C3GHI}tM!vEPcb{SgboN>U~e6g1Lv@e3vLM@JByRGJBo+vn>Rz6 zvXDwGqL_zT=$NZeXW@epmwVwgnJFcCC+uW0q$AyfPb`!+UPnl!pFtv#abpVN->?ia z4!AV_^yo8e)eg%<(=}e{(fEqTJOT%?NQ5?iN5q;-Q%Sf|w96+AooO0>B)J;JS{YAh zo~TKs4D7Qr&+!@KmGU!NA5BNvkDRfrZZGq?Vwa}|uRh12y>y((0eFeCvAsm{VN4@# zJT;7kZdMJ6q_m|W=YX}V)39%4$~b~OG?ghw&!SPnQ>|Kz@3tVFH2qxcv_@LQs%2jQ|;MB42EE5^R$;cYLSoYlkP+8g=pwXz08P3sS*$|IOfdwjv zh7!ne-|c&ccpcsUr2u4c4u!vj`hMZUi3)@Ae}r;4wT|}jAv8j{>%Xz<>i|m3V-;(= zkB?9=a4dQU6_ul%>VN11(EG31+je?oUp>;lvVUD)%PV=1SMyre%}MU8p$BJ=@37h$ z($yTXnYW=qfg1bdyRewiwSyTr58r79( zZF1|L@poxF;;K!qUN;_o>mGy1jkto6Xys z*EYXu)=j0CC7cP_t|U~X+pJ?|jzwbJJx*oY*@J!3p@zh??RMyPQu1*drlysFj}7j; zk$rgy9vcFwWnjpgcU(MQPJ~4g;;^K7GD{Y4d2*vUoeL#6=IUh@O|X_^Y!vg$<+!m` H60H0KNH*8r From b2956e921c6af62355c5cff434735cd53b42a358 Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Mon, 28 Nov 2022 22:51:54 -0500 Subject: [PATCH 45/47] update for tests --- cleanBib.ipynb | 462 ++++------------------------------------- utils/preprocessing.py | 33 +-- utils/queries.py | 219 ++++++++++++++++--- 3 files changed, 245 insertions(+), 469 deletions(-) diff --git a/cleanBib.ipynb b/cleanBib.ipynb index 60d6ac3..1d1473e 100644 --- a/cleanBib.ipynb +++ b/cleanBib.ipynb @@ -43,19 +43,11 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "kernel": "Python 3" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "No optional .tex file found.\n" - ] - } - ], + "outputs": [], "source": [ "import glob\n", "from habanero import Crossref\n", @@ -66,11 +58,16 @@ "sys.path.insert(1, f'{wd.absolute()}/utils')\n", "from preprocessing import *\n", "from ethnicolr import pred_fl_reg_name\n", + "os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\"\n", "import 
tensorflow as tf\n", "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)\n", + "import warnings\n", + "warnings.simplefilter(action='ignore', category=FutureWarning)\n", "\n", "cr = Crossref()\n", - "homedir = '/Users/stisoj/Documents/dev/cleanBib/tests/immaculate/'\n", + "homedir = '/home/jovyan/'\n", "bib_files = glob.glob(homedir + '*.bib')\n", "paper_aux_file = glob.glob(homedir + '*.aux')\n", "paper_bib_file = 'library_paper.bib'\n", @@ -112,40 +109,31 @@ "checkingPublishedArticle = True\n", "```\n", "\n", - "Then, run the code block below. (click to select the block and then press Ctrl+Enter; or click the block and press the Run button in the top menubar)" + "Then, run the code block below. (click to select the block and then press Ctrl+Enter; or click the block and press the Run button in the top menubar)\n", + "\n", + "__NOTE__: Please edit your .bib file using information printed by the code and provided in cleanedBib.csv. Edit directly within the Binder environment by clicking the .bib file (as shown below), making modifications, and saving the file (as shown below).\n", + "\n", + "![open button](img/openBib.png)\n", + "\n", + "![save button](img/saveBib.png)\n", + "\n", + "Common issues include:\n", + "\n", + "* Bibliography entry did not include a last author because the author list was truncated by \"and Others\" or \"et al.\"\n", + "* Some older journals articles only provide first initial and not full first names, in which case you will need to go digging via Google to identify that person.\n", + "* In rare cases where the author cannot be identified even after searching by hand, replace the first name with \"UNKNOWNNAMES\" so that the classifier will estimate the gender as unknown." 
] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": { "kernel": "Python 3" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1: buzsaki2013memory\n", - "2: Lundine2019\t\t <-- ***NAME MISSING OR POSSIBLY INCOMPLETE***\n", - "3: zurn2020network\n", - "4: moralia2005\n", - "5: bassett2022curious\n", - "6: fake2022 <-- self-citation\n", - "7: jurafsky2018n\t\t <-- ***NAME MISSING OR POSSIBLY INCOMPLETE***\n", - "8: mitchell2013gendered\n", - "9: chatterjee2021gender\n", - "10: fulvio2021imbalance\n", - "11: ethnicolr2022black\n", - "12: ethnicolr2022hispanic\n", - "13: ethnicolr2022asian\n", - "14: ethnicolr2022white\n" - ] - } - ], + "outputs": [], "source": [ - "yourFirstAuthor = 'Stiso, Jennifer'\n", - "yourLastAuthor = 'Zhou, Dale'\n", + "yourFirstAuthor = 'LastName, FirstName'\n", + "yourLastAuthor = 'LastName, FirstName'\n", "optionalEqualContributors = ['LastName, FirstName OptionalMiddleInitial', 'LastName, FirstName OptionalMiddleInitial']\n", "checkingPublishedArticle = False\n", "\n", @@ -159,37 +147,14 @@ " # find and print duplicates\n", " bib_data = get_duplicates(bib_data, bib_files[0])\n", " # get names, remove CDS, find self cites\n", - " get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "kernel": "R" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "STOP: Please remove self-citations. Then, re-run step 2. Here are some suggestions to check for with the following citation keys in your .bib file: \n", - "['fake2022']\n", - "STOP: Please revise incomplete full first names or empty cells. Then, re-run step 2. 
Here are some suggestions to check for with the following citation keys in your .bib file: \n", - "['Lundine2019', 'jurafsky2018n']\n", - "Only continue if you've run steps 2, and this code no longer returns errors.\n" - ] - } - ], - "source": [ + " get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualContributors, cr)\n", + " \n", "bib_check(homedir)" ] }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "## 3. Estimate gender and race of authors from cleaned bibliography\n", "\n", @@ -200,18 +165,6 @@ "\n", "[You can find your key in your account's profile page.](https://gender-api.com/en/account/overview#my-api-key)\n", "\n", - "__NOTE__: Please edit your .bib file using information printed by the code and provided in cleanedBib.csv. Edit directly within the Binder environment by clicking the .bib file (as shown below), making modifications, and saving the file (as shown below).\n", - "\n", - "![open button](img/openBib.png)\n", - "\n", - "![save button](img/saveBib.png)\n", - "\n", - "Common issues include:\n", - "\n", - "* Bibliography entry did not include a last author because the author list was truncated by \"and Others\" or \"et al.\"\n", - "* Some older journals articles only provide first initial and not full first names, in which case you will need to go digging via Google to identify that person.\n", - "* In rare cases where the author cannot be identified even after searching by hand, replace the first name with \"UNKNOWNNAMES\" so that the classifier will estimate the gender as unknown.\n", - "\n", "__NOTE__: your free account has 500 queries per month. This box contains the code that will use your limited API credits/queries if it runs without error. Re-running all code repeatedly will repeatedly use these credits.\n", "\n", "Then, run the code blocks below. 
(click to select the block and then press Ctrl+Enter; or click the block and press the Run button in the top menubar)" @@ -219,38 +172,16 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": { "kernel": "R" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Remaining credits: 262\n", - "This should use (at most) 25 credits, saving you approx 1 credit(s) by storing queries.\n" - ] - } - ], + "outputs": [], "source": [ "genderAPI_key = '&key='\n", "\n", - "# The following saves the api key to a txt file just to be reloaded by the next cell\n", - "with open(\"genderAPIkey.txt\", 'w') as f:\n", - " f.write(genderAPI_key)\n", - "\n", "# Check your credit balance\n", - "authors_full_list = pd.read_csv(homedir + 'cleanedBib.csv')\n", - "url = \"https://gender-api.com/get-stats?key=\" + genderAPI_key\n", - "response = urlopen(url)\n", - "decoded = response.read().decode('utf-8')\n", - "decoded_json = json.loads(decoded)\n", - "print('Remaining credits: %s'%decoded_json[\"remaining_requests\"])\n", - "print('This should use (at most) %d credits, '%(authors_full_list.FA.nunique() + authors_full_list.LA.nunique()) + \\\n", - " 'saving you approx %d'%((authors_full_list.FA.count() + authors_full_list.LA.count())-\n", - " (authors_full_list.FA.nunique() + authors_full_list.LA.nunique())) + \\\n", - " ' credit(s) by storing queries.')" + "check_genderAPI_balance(genderAPI_key, homedir)" ] }, { @@ -268,222 +199,13 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": { "kernel": "Python 3" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "first author is Jennifer Stiso \n", - "last author is Dale Zhou \n", - "we don't count these, but check the predictions file to ensure your names did not slip through!\n", - "looping through your references, predicting gender and race\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - 
"text": [ - " 0%| | 0/23 [00:00 0: - print("In your .bib file, we found and removed duplicate entries for the following entries:\n " + + print("\n In your .bib file, we found and removed duplicate entries for the following entries:\n " + ' '.join(map(str, duplicates)) + - "\n If this is incorrect, please edit you .bib file to give unique identifiers for all unique references:") + "\n If this is incorrect, please edit your .bib file to give unique identifiers for all unique references. \n") + if len(duplicates) > 0: # write new data to file - new_bib = filename[:-4] + '_clean.bib' + new_bib = filename[:-4] + '_noDuplicates.bib' with open(new_bib, 'w') as bibtex_file: bibtexparser.dump(bib_data, bibtex_file) @@ -312,6 +313,7 @@ def get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualC for key in bib_data.entries.keys(): diversity_bib_titles = ['The extent and drivers of gender imbalance in neuroscience reference lists', 'The gender citation gap in international relations', + 'Gendered citation patterns in international relations journals', 'Quantitative evaluation of gender bias in astronomical publications from citation counts', '\# CommunicationSoWhite', '{Just Ideas? 
The Status and Future of Publication Ethics in Philosophy: A White Paper}', @@ -321,7 +323,9 @@ def get_names(homedir, bib_data, yourFirstAuthor, yourLastAuthor, optionalEqualC 'Gender Diversity Statement and Code Notebook v1.1', 'Gendered citation practices in the field of communication', 'Gender disparity in citations in high- impact journal articles', + 'Gender Disparity in Citations in High-Impact Journal Articles', 'Gender (im)balance in citation practices in cognitive neuroscience', + 'Gender (Im)balance in Citation Practices in Cognitive Neuroscience', 'Name-ethnicity classification from open sources', 'Predicting race and ethnicity from the sequence of characters in a name'] if bib_data.entries[key].fields['title'] in diversity_bib_titles: @@ -473,13 +477,16 @@ def self_cites(author, yourFirstAuthor, yourLastAuthor, optionalEqualContributor def bib_check(homedir): # Do a final check on the bibliography entries + authors_full_list = pd.read_csv(homedir + 'cleanedBib.csv') + skip_selfCites = list(authors_full_list.loc[authors_full_list['SelfCite'] == 'Y']['CitationKey']) + with open(os.path.join(homedir, 'cleanedBib.csv')) as csvfile: names_csv = csv.reader(csvfile) names_db = [] for row in names_csv: names_db.append(row) - incomplete_name_bib_keys, self_cite_bib_keys = [[], []] + incomplete_name_bib_keys = [] authors_full_list = [] for row in names_db[1:]: # Skip the first row, it's just headers # Check that the authors' names have at least 2 characters and no periods @@ -487,24 +494,18 @@ def bib_check(homedir): authors_full_list.append(first_author) # For counting the number of query calls needed authors_full_list.append(last_author) if len(first_author) < 2 or len(last_author) < 2 or '.' in first_author + last_author: - incomplete_name_bib_keys.append(bib_key) - if self_cite == 'Y': - self_cite_bib_keys.append(bib_key) - - if len(self_cite_bib_keys) > 0: - warning_message = "STOP: Please remove self-citations. Then, re-run step 2. 
" - warning_message += "Here are some suggestions to check for with the following citation keys in your .bib file: " - print(warning_message) - print(self_cite_bib_keys) + if bib_key not in skip_selfCites: + incomplete_name_bib_keys.append(bib_key) if len(incomplete_name_bib_keys) > 0: - warning_message = "STOP: Please revise incomplete full first names or empty cells. Then, re-run step 2. " + warning_message = "\n STOP: Please revise incomplete full first names or empty cells. Then, re-run step 2. " warning_message += "Here are some suggestions to check for with the following citation keys in your .bib file: " print(warning_message) print(incomplete_name_bib_keys) - final_warning_message = "Only continue if you've run steps 2," - final_warning_message += " and this code no longer returns errors." + final_warning_message = "\n Only continue if you've run step 2," + final_warning_message += " and this code no longer returns error or instructions to revise the .bib file." + print("\n") print(final_warning_message) diff --git a/utils/queries.py b/utils/queries.py index 4f7ff41..d09f615 100644 --- a/utils/queries.py +++ b/utils/queries.py @@ -9,6 +9,8 @@ from urllib.parse import quote from urllib.request import urlopen import json +import matplotlib.pyplot as plt +import seaborn as sns def namesFromXref(cr, doi, title, authorPos): '''Use DOI and article titles to query Crossref for author list''' @@ -75,6 +77,27 @@ def get_pred_demos(authors, homedir, bibfile, gender_key, font='Palatino', metho race = [] idx = 0 + # skip self-citations + authors_full_list = pd.read_csv(homedir + 'cleanedBib.csv') + skip_selfCites = list(authors_full_list.loc[authors_full_list['SelfCite'] == 'Y']['CitationKey']) + # skip citation diversity statement papers + diversity_bib_titles = ['The extent and drivers of gender imbalance in neuroscience reference lists', + 'The gender citation gap in international relations', + 'Gendered citation patterns in international relations journals', + 
'Quantitative evaluation of gender bias in astronomical publications from citation counts', + '\# CommunicationSoWhite', + '{Just Ideas? The Status and Future of Publication Ethics in Philosophy: A White Paper}', + 'Gendered citation patterns across political science and social science methodology fields', + 'Gender Diversity Statement and Code Notebook v1.0', + 'Racial and ethnic imbalance in neuroscience reference lists and intersections with gender', + 'Gender Diversity Statement and Code Notebook v1.1', + 'Gendered citation practices in the field of communication', + 'Gender disparity in citations in high- impact journal articles', + 'Gender Disparity in Citations in High-Impact Journal Articles', + 'Gender (im)balance in citation practices in cognitive neuroscience', + 'Gender (Im)balance in Citation Practices in Cognitive Neuroscience', + 'Name-ethnicity classification from open sources', + 'Predicting race and ethnicity from the sequence of characters in a name'] # save base gender rates gender_base = get_gender_base(homedir) # make a dictionary of names so we don't query the same thing twice @@ -83,6 +106,10 @@ def get_pred_demos(authors, homedir, bibfile, gender_key, font='Palatino', metho n_gen_queries = 0 n_race_queries = 0 for paper in tqdm.tqdm(bibfile.entries, total=len(bibfile.entries)): + if paper in skip_selfCites: + continue + if bibfile.entries[paper].fields['title'] in diversity_bib_titles: + continue if 'author' not in bibfile.entries[paper].persons.keys(): continue # some editorials have no authors if 'year' not in bibfile.entries[paper].fields.keys(): @@ -209,7 +236,7 @@ def get_pred_demos(authors, homedir, bibfile, gender_key, font='Palatino', metho mm, wm, mw, ww = np.mean(gender, axis=0) * 100 WW, aw, wa, aa = np.mean(race, axis=0) * 100 - return mm, wm, mw, ww, WW, aw, wa, aa, citation_matrix + return mm, wm, mw, ww, WW, aw, wa, aa, citation_matrix, paper_df def gen_api_query(gender_key, name, gb): url = "https://gender-api.com/get?key=" 
+ gender_key + "&name=%s" % (quote(name)) @@ -225,51 +252,177 @@ def gen_api_query(gender_key, name, gb): return gender, g def print_statements(mm, wm, mw, ww, WW, aw, wa, aa): - statement = "Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars \ - are under-cited relative to the number of such papers in the field (1-9). Here we sought to proactively consider choosing references that reflect the \ - diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. First, we obtained the predicted gender of the first \ - and last author of each reference by using databases that store the probability of a first name being carried by a woman (5, 10). By this measure \ - (and excluding self-citations to the first and last authors of our current paper), our references contain ww% woman(first)/woman(last), \ - MW% man/woman, WM% woman/man, and MM% man/man. This method is limited in that a) names, pronouns, and social media profiles used to construct the \ - databases may not, in every case, be indicative of gender identity and b) it cannot account for intersex, non-binary, or transgender people. \ - Second, we obtained predicted racial/ethnic category of the first and last author of each reference by databases that store the probability of a \ - first and last name being carried by an author of color (11, 12). By this measure (and excluding self-citations), our references contain AA% author of \ - color (first)/author of color(last), WA% white author/author of color, AW% author of color/white author, and WW% white author/white author. This method \ - is limited in that a) names and Florida Voter Data to make the predictions may not be indicative of racial/ethnic identity, and b) \ - it cannot account for Indigenous and mixed-race authors, or those who may face differential biases due to the ambiguous racialization or ethnicization of their names. 
\ - We look forward to future work that could help us to better understand how to support equitable practices in science." + statement = ("Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars " + "are under-cited relative to the number of such papers in the field (1-9). Here we sought to proactively consider choosing references that reflect the " + "diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. First, we obtained the predicted gender of the first " + "and last author of each reference by using databases that store the probability of a first name being carried by a woman (5, 10). By this measure " + "and excluding self-citations to the first and last authors of our current paper), our references contain ww% woman(first)/woman(last), " + "MW% man/woman, WM% woman/man, and MM% man/man. This method is limited in that a) names, pronouns, and social media profiles used to construct the " + "databases may not, in every case, be indicative of gender identity and b) it cannot account for intersex, non-binary, or transgender people. " + "Second, we obtained predicted racial/ethnic category of the first and last author of each reference by databases that store the probability of a " + "first and last name being carried by an author of color (11, 12). By this measure (and excluding self-citations), our references contain AA% author of " + "color (first)/author of color(last), WA% white author/author of color, AW% author of color/white author, and WW% white author/white author. This method " + "is limited in that a) names and Florida Voter Data to make the predictions may not be indicative of racial/ethnic identity, and b) " + "it cannot account for Indigenous and mixed-race authors, or those who may face differential biases due to the ambiguous racialization or ethnicization of their names. 
" + "We look forward to future work that could help us to better understand how to support equitable practices in science.") statement = statement.replace('MM', str(np.around(mm, 2))) statement = statement.replace('WM', str(np.around(wm, 2))) statement = statement.replace('MW', str(np.around(mw, 2))) statement = statement.replace('ww', str(np.around(ww, 2))) - statement = statement.replace('WW', str(np.around(WW, 2))) - statement = statement.replace('AW', str(np.around(aw, 2))) - statement = statement.replace('WA', str(np.around(wa, 2))) + statement = statement.replace('WW', np.array2string(WW.values[0], formatter={'float_kind':lambda x: "%.2f" % x})) + statement = statement.replace('AW', np.array2string(aw.values[0], formatter={'float_kind':lambda x: "%.2f" % x})) + statement = statement.replace('WA', np.array2string(wa.values[0], formatter={'float_kind':lambda x: "%.2f" % x})) statement = statement.replace('AA', str(np.around(aa, 2))) - statementLatex = "Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars \ - are under-cited relative to the number of such papers in the field \cite{mitchell2013gendered,dion2018gendered,caplar2017quantitative, maliniak2013gender, Dworkin2020.01.03.894378, bertolero2021racial, wang2021gendered, chatterjee2021gender, fulvio2021imbalance}. Here we sought to proactively consider choosing references that reflect the\ - diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. First, we obtained the predicted gender of the first \ - and last author of each reference by using databases that store the probability of a first name being carried by a woman \cite{Dworkin2020.01.03.894378,zhou_dale_2020_3672110}. 
By this measure \ - (and excluding self-citations to the first and last authors of our current paper), our references contain ww\% woman(first)/woman(last), \ - MW\% man/woman, WM\% woman/man, and MM\% man/man. This method is limited in that a) names, pronouns, and social media profiles used to construct the \ - databases may not, in every case, be indicative of gender identity and b) it cannot account for intersex, non-binary, or transgender people. \ - Second, we obtained predicted racial/ethnic category of the first and last author of each reference by databases that store the probability of a \ - first and last name being carried by an author of color \cite{ambekar2009name, sood2018predicting}. By this measure (and excluding self-citations), our references contain AA\% author of \ - color (first)/author of color(last), WA\% white author/author of color, AW\% author of color/white author, and WW\% white author/white author. This method \ - is limited in that a) names and Florida Voter Data to make the predictions may not be indicative of racial/ethnic identity, and b) \ - it cannot account for Indigenous and mixed-race authors, or those who may face differential biases due to the ambiguous racialization or ethnicization of their names. \ - We look forward to future work that could help us to better understand how to support equitable practices in science." + statementLatex = ("Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars " + "are under-cited relative to the number of such papers in the field \cite{mitchell2013gendered,dion2018gendered,caplar2017quantitative, maliniak2013gender, Dworkin2020.01.03.894378, bertolero2021racial, wang2021gendered, chatterjee2021gender, fulvio2021imbalance}. Here we sought to proactively consider choosing references that reflect the " + "diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. 
First, we obtained the predicted gender of the first " + "and last author of each reference by using databases that store the probability of a first name being carried by a woman \cite{Dworkin2020.01.03.894378,zhou_dale_2020_3672110}. By this measure " + "(and excluding self-citations to the first and last authors of our current paper), our references contain ww\% woman(first)/woman(last), " + "MW\% man/woman, WM\% woman/man, and MM\% man/man. This method is limited in that a) names, pronouns, and social media profiles used to construct the " + "databases may not, in every case, be indicative of gender identity and b) it cannot account for intersex, non-binary, or transgender people. " + "Second, we obtained predicted racial/ethnic category of the first and last author of each reference by databases that store the probability of a " + "first and last name being carried by an author of color \cite{ambekar2009name, sood2018predicting}. By this measure (and excluding self-citations), our references contain AA\% author of " + "color (first)/author of color(last), WA\% white author/author of color, AW\% author of color/white author, and WW\% white author/white author. This method " + "is limited in that a) names and Florida Voter Data to make the predictions may not be indicative of racial/ethnic identity, and b) " + "it cannot account for Indigenous and mixed-race authors, or those who may face differential biases due to the ambiguous racialization or ethnicization of their names. 
" + "We look forward to future work that could help us to better understand how to support equitable practices in science.") statementLatex = statementLatex.replace('MM', str(np.around(mm, 2))) statementLatex = statementLatex.replace('WM', str(np.around(wm, 2))) statementLatex = statementLatex.replace('MW', str(np.around(mw, 2))) statementLatex = statementLatex.replace('ww', str(np.around(ww, 2))) - statementLatex = statementLatex.replace('WW', str(np.around(WW, 2))) - statementLatex = statementLatex.replace('AW', str(np.around(aw, 2))) - statementLatex = statementLatex.replace('WA', str(np.around(wa, 2))) + statementLatex = statementLatex.replace('WW', np.array2string(WW.values[0], formatter={'float_kind':lambda x: "%.2f" % x})) + statementLatex = statementLatex.replace('AW', np.array2string(aw.values[0], formatter={'float_kind':lambda x: "%.2f" % x})) + statementLatex = statementLatex.replace('WA', np.array2string(wa.values[0], formatter={'float_kind':lambda x: "%.2f" % x})) statementLatex = statementLatex.replace('AA', str(np.around(aa, 2))) return statement, statementLatex +def plot_heatmaps(citation_matrix, homedir): + cmap = sns.diverging_palette(220, 10, as_cmap=True) + names = ['white_m','api_m','hispanic_m','black_m','white_w','api_w','hispanic_w','black_w'] + plt.close() + sns.set(style='white') + fig, axes = plt.subplots(ncols=2,nrows=1,figsize=(7.5,4)) + axes = axes.flatten() + plt.sca(axes[0]) + heat = sns.heatmap(np.around((citation_matrix/citation_matrix.sum())*100,2),annot=True,ax=axes[0],annot_kws={"size": 8},cmap=cmap,vmax=1,vmin=0) + axes[0].set_ylabel('first author',labelpad=0) + heat.set_yticklabels(names,rotation=0) + axes[0].set_xlabel('last author',labelpad=1) + heat.set_xticklabels(names,rotation=90) + heat.set_title('percentage of citations') + + citation_matrix_sum = citation_matrix / np.sum(citation_matrix) + + expected = np.load('/%s/data/expected_matrix_florida.npy'%(homedir)) + expected = expected/np.sum(expected) + + 
percent_overunder = np.ceil( ((citation_matrix_sum - expected) / expected)*100) + plt.sca(axes[1]) + heat = sns.heatmap(np.around(percent_overunder,2),annot=True,ax=axes[1],fmt='g',annot_kws={"size": 8},vmax=50,vmin=-50,cmap=cmap) + axes[1].set_ylabel('',labelpad=0) + heat.set_yticklabels('') + axes[1].set_xlabel('last author',labelpad=1) + heat.set_xticklabels(names,rotation=90) + heat.set_title('percentage over/under-citations') + plt.tight_layout() + + plt.savefig('/home/jovyan/race_gender_citations.pdf') + +def plot_histograms(): + # Plot a histogram # + names = pd.read_csv('/home/jovyan/predictions.csv') + total_citations = names.CitationKey.nunique() + names.GendCat = names.GendCat.str.replace('female', 'W', regex=False) + names.GendCat = names.GendCat.str.replace('male', 'M', regex=False) + names.GendCat = names.GendCat.str.replace('unknown', 'U', regex=False) + gend_cats = names['GendCat'].dropna().unique() # get a vector of all the gender categories in your paper + + # Create a data frame that will be used to plot the histogram. 
This will have the gender category (e.g., WW, MM) in the first column and the percentage (e.g., number of WW citations divided by total number of citations * 100) in the second column # + dat_for_plot = names.groupby('GendCat').size().reset_index() + all_cats = ['MU', 'WW', 'UM', 'MW', 'WM', 'UW', 'MM'] + empty_dat_for_plot = pd.DataFrame(0, index=np.arange(7), columns=['GendCat', 0]) + empty_dat_for_plot['GendCat'] = all_cats + set(dat_for_plot['GendCat']).intersection(empty_dat_for_plot['GendCat']) + for i in set(dat_for_plot['GendCat']).intersection(empty_dat_for_plot['GendCat']): + empty_dat_for_plot.loc[empty_dat_for_plot['GendCat'] == i, 0] = dat_for_plot.loc[dat_for_plot['GendCat']== i, 0].values + dat_for_plot = empty_dat_for_plot + dat_for_plot.rename(columns={0:'count'}, inplace=True) + dat_for_plot = dat_for_plot.assign(percentage=dat_for_plot['count']/total_citations*100) + + # Create a data frame with only the WW, MW, WM, MM categories and their base rates - to plot percent citations relative to benchmarks + dat_for_baserate_plot = dat_for_plot.loc[(dat_for_plot.GendCat == 'WW') | + (dat_for_plot.GendCat == 'MW') | + (dat_for_plot.GendCat == 'WM') | + (dat_for_plot.GendCat == 'MM'),:] + # MM,MW,WM,WW + # 58.4% for man/man, 9.4% for man/woman, 25.5% for woman/man, and 6.7% for woman/woman + baserate = [6.7, 9.4, 25.5, 58.4] + dat_for_baserate_plot['baserate'] = baserate + dat_for_baserate_plot = dat_for_baserate_plot.assign(citation_rel_to_baserate= + dat_for_baserate_plot.percentage - dat_for_baserate_plot.baserate + ) + + # plot + plt.figure() + sns.barplot(data=dat_for_plot, x='GendCat', y='count', order=np.flip(gend_cats)) + plt.xlabel('Predicted gender category') + plt.ylabel('Number of papers') + plt.tight_layout() + + plt.figure() + sns.barplot(data=dat_for_baserate_plot, x='GendCat', y='citation_rel_to_baserate', order=['WW','WM','MW','MM']) + plt.xlabel('Predicted gender category') + plt.ylabel('% of citations relative to benchmarks') + 
plt.tight_layout() + + +def check_genderAPI_balance(genderAPI_key, homedir): + authors_full_list = pd.read_csv(homedir + 'cleanedBib.csv') + authors_full_list = authors_full_list.loc[authors_full_list['SelfCite'] == 'N'] + + url = "https://gender-api.com/get-stats?key=" + genderAPI_key + response = urlopen(url) + decoded = response.read().decode('utf-8') + decoded_json = json.loads(decoded) + print('Remaining credits: %s'%decoded_json["remaining_requests"]) + print('This should use (at most) %d credits, '%(authors_full_list.FA.nunique() + authors_full_list.LA.nunique()) + \ + 'saving you approx %d'%((authors_full_list.FA.count() + authors_full_list.LA.count())- + (authors_full_list.FA.nunique() + authors_full_list.LA.nunique())) + \ + ' credit(s) by storing queries.') + + +def colorful_latex(paper_df, homedir, tex_file): + cite_gender = paper_df[1::2] + cite_gender.GendCat = cite_gender.GendCat.str.replace('female', 'W', regex=False) + cite_gender.GendCat = cite_gender.GendCat.str.replace('male', 'M', regex=False) + cite_gender.GendCat = cite_gender.GendCat.str.replace('unknown', 'U', regex=False) + cite_gender.index = cite_gender.CitationKey + cite_gender['Color'] = '' # what color to make each gender category + colors = {'MM':'red','MW':'blue','WW':'green','WM':'magenta','UU':'black', + 'MU':'black','UM':'black','UW':'black','WU':'black'} + for idx in cite_gender.index: # loop through each citation key and set color + cite_gender.loc[idx,'Color'] = colors[cite_gender.loc[idx,'GendCat']] + + fin = open(homedir+tex_file) + texdoc=fin.readlines() + with open(homedir+tex_file[:-4]+'_gendercolor.tex','w') as fout: + for i in range(len(texdoc)): + s = texdoc[i] + cite_instances = re.findall('\\\\cite\{.*?\}',s) + cite_keys = re.findall('\\\\cite\{(.*?)\}',s) + cite_keys = [x.split(',') for x in cite_keys] + cite_keys_sub = [['\\textcolor{' + cite_gender.loc[x.strip(),'Color'] + '}{\\cite{'+x.strip()+'}}' for x in cite_instance] for cite_instance in cite_keys] + 
cite_keys_sub = ['\\textsuperscript{,}'.join(x) for x in cite_keys_sub] + for idx,cite_instance in enumerate(cite_instances): + s = s.replace(cite_instances[idx],cite_keys_sub[idx]) + fout.write(s) + # place color key after abstract + if '\\section*{Introduction}\n' in s: + l = ['\\textcolor{' + colors[k] + '}{'+k+'}' for k in colors.keys()] + fout.write('\tKey: '+ ', '.join(l)+'.\n') + + From 14c9dc7a0d3582d607dbbfd4e7ad10356505b118 Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Mon, 28 Nov 2022 23:26:28 -0500 Subject: [PATCH 46/47] update README --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index d73aecd..768dbf6 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,7 @@ And editorials and research highlights of this work: * Z. Budrikis (2020). Growing citation gender gap. *Nature Reviews Physics*. [doi: https://doi.org/10.1038/s42254-020-0207-3](https://doi.org/10.1038/s42254-020-0207-3) * D. J. Sweet (2021). New at cell press: the inclusion and diversity statement. *Cell*, 184(1), 1-2. [doi: https://doi.org/10.1016/j.cell.2020.12.019](https://www.sciencedirect.com/science/article/pii/S0092867420316895?via%3Dihub) * B. Rowson, S.M. Duma, M.R. King, I. Efimov, A. Saterbak, and N.C. Chesler (2021). Citation diversity statement in BMES journals. *Annals of Biomedical Engineering*, 1-3. [doi: https://doi.org/10.1007/s10439-021-02739-6](https://link.springer.com/article/10.1007/s10439-021-02739-6) +* D. Kwon (2022). The rise of citational justice: how scholars are making references fairer. *Nature*, 603(7902), 568-571. [doi: https://doi.org/10.1038/d41586-022-00793-1](https://www.nature.com/articles/d41586-022-00793-1) For `.pdf` and `.tex` templates of the statement, see the `/diversityStatement` directory in this repository. 
@@ -333,6 +334,7 @@ ___ * Christopher Camp * Eli Cornblath * Jordan Dworkin +* Kieran Murphy * Jordan Matelsky * Cleanthis Michael * Kendra Oudyk @@ -344,6 +346,15 @@ ___ * Dale Zhou # Changelog +* __11/28/2022__ + * major refactor (thanks, Jeni!) + * removed SOS notebook + * upgraded all packages and libraries + * all R code now in Python + * majority of code now calls functions located in utils/ + * automate removal of identified duplicates and self-citations + * query how many credits are left and save some by only querying unique names (thanks, Kieran!) + * fixes issue introduced by [protobuf upgrade](https://github.com/protocolbuffers/protobuf/issues/10051) * __9/14/2021__ * force Binder to load with [classic theme](https://discourse.jupyter.org/t/mybinder-org-using-jupyterlab-by-default/10715) because new default breaks SOS notebook R code From d6f2b8b4f267de39bb69357bcc3c704ca58d4980 Mon Sep 17 00:00:00 2001 From: Dale Zhou Date: Mon, 28 Nov 2022 23:30:09 -0500 Subject: [PATCH 47/47] update launch binder link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 768dbf6..590897f 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,7 @@ ___ 2. Launch the coding environment. Please refresh the page if the Binder does not load after 5-10 mins. - [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/dalejn/cleanBib/34b3a896b6fe0961b2dfc3ad22214cf45da48cca?urlpath=/tree/) + [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/dalejn/cleanBib/refactor?urlpath=/tree/) 3. Open the notebook `cleanBib.ipynb`. Follow the instructions above each code block. It can take 10 minutes to 1 hour to complete all of the instructions, depending on the state and size of your `.bib` file. We expect that the most time-consuming step will be manually modifying the `.bib` file to find missing author names, fill incomplete entries, and fix formatting errors. 
These problems arise because automated methods of reference managers and Google Scholar sometimes cannot retrieve full information, for example if some journals only provide an author's first initial instead of their full first name.