Gensim method is not working in keyword.py while doing keyword extraction from literature with docanalysis #40

Open
flower1430 opened this issue Mar 27, 2024 · 1 comment


@flower1430

In the Colab notebook, the following command is used to extract keywords from the literature (rake and yake work, while gensim does not):

```
!python /content/semanticClimate/keyword_extraction/code/keyword.py \
  --html_path /content/remote_agri/eupmc_result.html \
  --saving_path /content/ \
  --method 'rake'
```
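The failing run is the same command with the gensim method (paths identical; only the `--method` value changes):

```
!python /content/semanticClimate/keyword_extraction/code/keyword.py \
  --html_path /content/remote_agri/eupmc_result.html \
  --saving_path /content/ \
  --method 'gensim'
```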

keyword.py
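The imports imply roughly this environment (package names inferred from the imports; versions unpinned):

```
pip install beautifulsoup4 keybert multi-rake summa yake pandas spacy
python -m spacy download en_core_web_lg
```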

"""Untitled56.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1aCn-dAMP2zpfjFRA2b_Ts5G58rJVbWKr
"""

from bs4 import BeautifulSoup
from keybert import KeyBERT
from multi_rake import Rake
from summa import keywords
import yake
from IPython.display import HTML
import pandas as pd
import requests
import os
import argparse
import spacy
nlp = spacy.load("en_core_web_lg")


class keyword_extraction():
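  # Overall flow: read the saved HTML, strip scripts/styles and flatten the
  # visible text, run the chosen extractor (rake / yake / gensim / textrank /
  # keyBERT), POS- and lemma-filter the candidates with spaCy in clean(),
  # and write the surviving keyword/score pairs to a CSV in saving_path.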
  def __init__(self,html_path, saving_path, method):
    self.html_path = html_path
    self.saving_path = saving_path
    self.method = method
    self.text = ''
    self.span_list = []

  def extract_span_list(self):
    with open(self.html_path, 'r', encoding="utf-8") as f:
      html = f.read()
      soup = BeautifulSoup(html, features="html.parser")
      with open('/content/html_ex.html', 'w', encoding="utf-8") as file:
        file.write(soup.prettify())

      soup_elem = soup.find_all("span")
      for span_elem in soup_elem:
        #print(span_elem)
        span_elem.extract() 
        span_text = span_elem.get_text().strip()
        lines = (line.strip() for line in span_text.splitlines())
        # break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines for phrase in line.split("  ") if len(phrase.strip())>9 )
        # drop blank lines
        #text_write = '\n'.join(chunk for chunk in chunks if chunk)
        span_text = ' '.join(chunk for chunk in chunks if chunk)
        if len(span_text)>9 and 'http' not in span_text and 'doi' not in span_text and 'Chapter' not in span_text:
          # print(span_text)
          # print('-'*50)
          self.span_list.append(span_text)
    return self.span_list      
  #def tf_idf(self):
  
  def clean(self, df):
      def tagger(x):
        return nlp(x)[0].pos_

      def lemma(x):
        return nlp(x)[0].lemma_

      df['POS'] = df['keyword/phrase'].apply(tagger)
      df['Lemma'] = df['keyword/phrase'].apply(lemma)
      # keep only entries that are already in lemma form
      df = df[df['keyword/phrase'] == df['Lemma']]
      # drop rows that share a score, keeping the last occurrence
      df = df.drop_duplicates(subset=['score'], keep='last')
      # keep only nouns, proper nouns, adjectives and adverbs
      df = df[df.POS.isin(['NOUN', 'PROPN', 'ADJ', 'ADV'])]
      # drop URL fragments and publication boilerplate
      df = df[~df['Lemma'].isin(['http', 'https', 'publication', 'Chapter'])]
      df = df.drop(columns=['Lemma'])
      return df


  def extract_text_fom_html(self):

    with open(self.html_path, 'r', encoding="utf-8") as f:
      html = f.read()
      soup = BeautifulSoup(html, features="html.parser")
     
      for script in soup(["script", "style"]):
          script.extract()    # rip it out
      
      # get text
      text = soup.get_text()
      #print(text)
      # break into lines and remove leading and trailing space on each
      lines = (line.strip() for line in text.splitlines())
      # break multi-headlines into a line each
      chunks = (phrase.strip() for line in lines for phrase in line.split("      ") if len(phrase.strip())>9 )
      # drop blank lines
      #text_write = '\n'.join(chunk for chunk in chunks if chunk)
      text = '\n '.join(chunk for chunk in chunks if chunk)
      self.text = text
      #print(text)
      # TEXT_ = f'Chapter06_text.txt'
      # saving_path = '/content/'     
      with open('text.txt', 'w', encoding="utf-8") as file:
          file.write(text)
      return self.text
  def extract_keywords_rake(self):
    rake = Rake()
    self.extract_text_fom_html()
    keywords_Rake = rake.apply(self.text)
    df_Rake = pd.DataFrame(keywords_Rake)
    df_Rake.rename(columns={0: 'keyword/phrase', 1: 'score'}, inplace=True)
    df_Rake = self.clean(df_Rake)
    df_Rake.to_csv(self.saving_path + 'Rake_keywords.csv', index=None)

  def extract_keywords_gensim(self):
    self.extract_text_fom_html()
    # failing call: 'keywords' is the summa module imported above, and the
    # extra arguments follow the gensim.summarization API removed in gensim 4.0
    keywords_gensim = keywords(self.text, words=100, scores=True, pos_filter=('NN', 'ADJ'), lemmatize=False, deacc=False)
    df_gensim = pd.DataFrame(keywords_gensim)
    df_gensim.rename(columns={0: 'keyword/phrase', 1: 'score'}, inplace=True)
    df_gensim = self.clean(df_gensim)
    df_gensim.to_csv(self.saving_path + 'gensim_keywords.csv', index=None)

  def extract_keywords_yake(self):
    self.extract_text_fom_html()
    kw_extractor = yake.KeywordExtractor(top=100, stopwords=None)
    keywords_yake = kw_extractor.extract_keywords(self.text)
    df_yake = pd.DataFrame(keywords_yake)
    df_yake.rename(columns={0: 'keyword/phrase', 1: 'score'}, inplace=True)
    df_yake = self.clean(df_yake)
    df_yake.to_csv(self.saving_path + 'yake_keywords.csv', index=None)
    # for kw, v in keywords_yake:
    #   print("Keyphrase: ",kw, ": score", v)  

  def extract_keywords_textrank(self):
    self.extract_text_fom_html()
    keywords_textrank = keywords.keywords(self.text, scores=True)
    df_textrank = pd.DataFrame(keywords_textrank)
    df_textrank.rename(columns={0: 'keyword/phrase', 1: 'score'}, inplace=True)
    df_textrank = self.clean(df_textrank)
    df_textrank.to_csv(self.saving_path + 'textrank_keywords.csv', index=None)

  def extract_keywords_keyBERT(self):
    self.extract_text_fom_html()  # was missing, leaving self.text empty
    kw_model = KeyBERT(model='all-mpnet-base-v2')
    keywords_keyBERT = kw_model.extract_keywords(self.text,
                                                 keyphrase_ngram_range=(1, 2),
                                                 stop_words='english',
                                                 highlight=True,
                                                 top_n=10)
    df_keyBERT = pd.DataFrame(keywords_keyBERT)
    df_keyBERT.rename(columns={0: 'keyword/phrase', 1: 'score'}, inplace=True)
    df_keyBERT = self.clean(df_keyBERT)
    df_keyBERT.to_csv(self.saving_path + 'keyBERT_keywords.csv', index=None)
    
  def main(self):
    # dispatch on self.method (the bare global 'method' only happened to
    # work when the script was run from __main__)
    if self.method == 'rake':
      self.extract_keywords_rake()
    elif self.method == 'yake':
      self.extract_keywords_yake()
    elif self.method == 'gensim':
      self.extract_keywords_gensim()
    elif self.method == 'textrank':
      self.extract_keywords_textrank()
    elif self.method == 'keyBERT':
      self.extract_keywords_keyBERT()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--html_path',
                      required=True,
                      help='give the path where your html lives: /...')
    parser.add_argument('--saving_path',
                      required=True,
                      help='path of the folder where you want to save the files : /...'
                      )
    parser.add_argument('--method',
                      required=True, choices=['rake', 'yake', 'gensim', 'keyBERT', 'textrank'],
                      help='which method you want to use to extract keywords')
    

    args = parser.parse_args()

    html_path = args.html_path #'/content/semanticClimate/ipcc/ar6/wg3/Chapter06/fulltext.flow.html'
    saving_path = args.saving_path  #'/content/'
    method = args.method
    
    keyword_extractions = keyword_extraction(html_path, saving_path, method)
    keyword_extractions.main()
```
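For what it's worth, the failure looks like it comes from the script itself rather than from docanalysis: `from summa import keywords` binds the summa module, so the call `keywords(self.text, ...)` in `extract_keywords_gensim` should raise `TypeError: 'module' object is not callable`. Its keyword arguments (`pos_filter`, `lemmatize`, `deacc`) belong to `gensim.summarization.keywords`, an API that was removed in gensim 4.0, so pointing the import at gensim would only work with gensim pinned below 4 (e.g. `gensim==3.8.3`). A minimal sketch of a drop-in replacement for the method, assuming summa stays the backend (`words` and `scores` are the only parameters summa shares with the old gensim call):

```python
# Sketch only: reuse summa's TextRank, as extract_keywords_textrank already
# does, and drop the gensim-3.x-only arguments.
def extract_keywords_gensim(self):
    self.extract_text_fom_html()
    # top 100 candidate keywords with their TextRank scores
    keywords_gensim = keywords.keywords(self.text, words=100, scores=True)
    df_gensim = pd.DataFrame(keywords_gensim)
    df_gensim.rename(columns={0: 'keyword/phrase', 1: 'score'}, inplace=True)
    df_gensim = self.clean(df_gensim)
    df_gensim.to_csv(self.saving_path + 'gensim_keywords.csv', index=None)
```

With this change the 'gensim' method becomes an alias for summa's TextRank; if the gensim-specific behaviour (POS filtering, lemmatisation) is really needed, the alternative is pinning gensim 3.x and importing `from gensim.summarization import keywords` under a different name.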
@petermr (Owner)

petermr commented Mar 27, 2024 via email
