import datetime
import os
import shutil

import requests
from bs4 import BeautifulSoup


def getPDFLinks(tickers):
    """Return the latest earnings-call transcript PDF link for each ticker.

    Scrapes screener.in's consolidated company page and picks the most
    recent "Raw Transcript" link by its "Mon YYYY" date label.

    Args:
        tickers: list of screener.in ticker symbols, e.g. ['RELIANCE', 'CIPLA'].

    Returns:
        dict mapping ticker -> transcript URL (str), or None for tickers
        where no concalls section or no "Raw Transcript" link was found.
        If ``tickers`` is not a list, returns the string
        'Provide only a list!' (kept for backward compatibility).
    """
    if not isinstance(tickers, list):
        return "Provide only a list!"
    links = {}
    for ticker in tickers:
        url = 'https://www.screener.in/company/' + ticker + '/consolidated/'
        # timeout so one unresponsive page cannot hang the whole scrape
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, 'html5lib')

        try:
            items = (soup.find(class_='documents concalls flex-column')
                         .find('ul').find_all('li'))
        except AttributeError:
            # bug fix: was a bare `except:`; AttributeError is what the
            # chained .find() calls raise when the concalls section is absent
            links[ticker] = None
            continue

        transcripts_by_date = {}
        for li in items:
            link = li.findChild(title="Raw Transcript")
            if not link:
                continue
            # the sibling div holds a "Mon YYYY" label, e.g. "Aug 2023"
            raw_date = li.findChild('div').string.replace(' ', '').replace('\n', '')
            iso_date = datetime.datetime.strptime(raw_date, '%b%Y').strftime('%Y-%m-%d')
            transcripts_by_date[iso_date] = link.attrs['href']

        # bug fix: the original indexed sorted(keys)[-1] and raised
        # IndexError when a concalls section had no "Raw Transcript" links
        if transcripts_by_date:
            links[ticker] = transcripts_by_date[max(transcripts_by_date)]
        else:
            links[ticker] = None

    return links


if __name__ == '__main__':
    tickers = ['RELIANCE', 'CIPLA']  # Input any list of tickers

    links = getPDFLinks(tickers)  # FORMAT: {Ticker: Link, Ticker: Link}
    path = 'Documents'
    if os.path.exists(path) and os.path.isdir(path):
        shutil.rmtree(path)
    os.mkdir('Documents')
    headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
    for ticker, link in links.items():
        print(ticker, link)
        if link is None:
            # bug fix: requests.get(None) raised for tickers with no transcript
            continue
        r = requests.get(link, headers=headers, timeout=60)
        with open('Documents/{}.pdf'.format(ticker), 'wb') as f:
            f.write(r.content)
import ast
import json
import os

import nltk
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import NLTKTextSplitter

# SECURITY fix: the original committed a live OpenAI API key in four places.
# Keys must come from the environment, never from source control. The original
# also wrote the key to a misspelled "OPEN_API_KEY" variable that nothing reads.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

llm = OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0)


def getLLM(path):
    """Build a conversational retrieval chain over one transcript PDF.

    Returns (chain, memory, index): the QA chain for querying the transcript,
    its conversation memory (cleared between questions by callers), and the
    vector index used later for page-number lookup.
    """
    loader = PyPDFLoader(path)
    index = VectorstoreIndexCreator(
        embedding=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
        # very large chunks keep whole speaker turns inside one embedding
        text_splitter=NLTKTextSplitter(chunk_size=50000),
    ).from_loaders([loader])
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    chain = ConversationalRetrievalChain.from_llm(
        llm, index.vectorstore.as_retriever(), memory=memory, chain_type='map_reduce')
    return chain, memory, index


def formatBulletList(points):
    """Join bullet fragments into one '•'-prefixed multi-line string.

    Drops a leading fragment shorter than 5 characters (the empty prefix
    that str.split('•') produces before the first bullet).
    """
    # bug fix: the original indexed points[0] without checking for an empty list
    if points and len(points[0]) < 5:
        points.pop(0)
    return '\n' + ''.join('•' + fragment for fragment in points)


def getSpeakers(chain, memory, ticker):
    """Ask the LLM which management speakers appear in the transcript.

    Two-step prompt: the first question seeds the conversation, the second
    refines the answer into a parseable Python list literal.
    """
    memory.clear()
    chain({'question': "The transcript starts with the management speaking. Who are the speakers from management? Print ONLY a python list. Only print the names. Do not include anyone who is not from {}.".format(ticker)})['answer']
    speakers = chain({'question': "Only keep the names of speakers which are from management. Print ONLY a python list and make sure the formatting will work inside an eval statement."})['answer']
    # SECURITY fix: literal_eval only parses literals, so a malicious or
    # garbled LLM answer cannot execute code the way eval() could.
    return ast.literal_eval(speakers)


def getSummary(chain, memory, speakers):
    """Summarise each speaker's remarks into ~10 bullet points.

    If the model's answer is cut off mid-sentence (it does not end with '.'
    or a newline), ask it to continue and stitch the two halves together.
    Returns {speaker_name: bullet_list_string}.
    """
    speakers_with_summary = {}
    for speaker in speakers:
        memory.clear()
        print(speaker)
        summ = chain({'question': "Print a summary of what {} said in the transcript in 10 unordered bullet points. For each bullet point, include context surrounding the idea so it is easily understood. The list should include key information about the business and plans for the future. The summary should not only focus on metrics and numbers but also highlight {}'s narrative and vision for the company\'s future. Each bullet point should end with a period.".format(speaker, speaker)})['answer']
        # bug fix: guard against an empty answer before indexing summ[-1]
        if summ and summ[-1] not in ('\n', '.'):
            # the last bullet was truncated: fetch the continuation and merge
            answer_split = summ.split('•')
            incomplete = chain({'question': 'continue'})['answer']
            completed_point = llm('Print the correct combination of these two sentences. Sentence 1: {} Sentence 2: {}'.format(answer_split[-1], incomplete)).replace('\n', '').replace('\"', '')
            sentences = [' ' + s + '\n' for s in nltk.sent_tokenize(completed_point)]
            answer_split = answer_split[:-1] + sentences
            summ = formatBulletList(answer_split)
        speakers_with_summary[speaker] = summ
    return speakers_with_summary


def getJSON(speakers_with_summ, index):
    """Attach a source page number to every bullet point.

    Runs a vector similarity search over the transcript index to locate the
    page each point likely came from. Returns a list of
    {'name': point, 'speaker': who, 'pageNumber': page} dicts.
    """
    points_json = []
    for speaker, summary in speakers_with_summ.items():
        for point in summary.split('•'):
            if len(point) < 5:  # skip the empty/whitespace split fragments
                continue
            hits = index.vectorstore.similarity_search_with_relevance_scores(
                'Where is this line: {}'.format(point))
            page = hits[-1][0].dict()['metadata']['page']
            points_json.append({'name': point, 'speaker': speaker, 'pageNumber': page})
    return points_json


def driver(tickers):
    """Run the full pipeline (index -> speakers -> summary -> JSON) per ticker."""
    for ticker in tickers:
        print(ticker)
        path = 'Documents/' + ticker + '.pdf'  # S3 Path
        chain, memory, index = getLLM(path)
        print('LLM Initialised')
        speakers = getSpeakers(chain, memory, ticker)
        print('Speakers Detected')
        print(speakers)
        summary = getSummary(chain, memory, speakers)
        for name, bullets in summary.items():
            print(name, bullets)
        print('Transcript Summarised')
        json_summary = getJSON(summary, index)
        print('Converted to JSON')
        print(json_summary)


if __name__ == '__main__':
    # bug fix: filter to .pdf so stray files (e.g. .DS_Store) do not become
    # bogus tickers; t[:-4] strips the '.pdf' suffix as before
    tickers = [f[:-4] for f in os.listdir('Documents') if f.endswith('.pdf')]
    driver(tickers)
allows retail investors to quickly glance over the main pointers organised by each speaker. Each pointer is clickable which takes them to the page on the pdf where the information in the pointer is referenced from. + +## Tech Stack: + Python, GPT 3.5 + Libraries: LangChain, BeautifulSoup, nltk + +## Step-by-Step Code Execution Instructions: + 1. Run the PDFScraper.py file to obtain PDF links for each of the tickers. Each link corresponds to the concall transcript for the latest earnings release for that company. This will also download PDFs for each ticker and save them to the local directory. + 2. Run PDFSummary.py and the summaries will be printed for each ticker. The page numbers will also be saved for each point in a JSON format for further analysis. + +## What I Learned: + LLMs are much more complex than they seem to be. It is quite difficult to obtain perfect output each time one interacts with an LLM such as GPT 3.5. For example, most of the time when GPT is prompted to produce code, it is not written perfectly and usually contains bugs that prevent it from being run. When working on this project, the main idea was to produce consistent outputs that do not change significantly when running on the same inputs. Obtaining deterministic behaviour was crucial in developing this tool. From eb7da542e288bb5bc63570ad5ae2ac5f22138400 Mon Sep 17 00:00:00 2001 From: Ved Sirdeshmukh Date: Thu, 24 Aug 2023 15:27:00 +0530 Subject: [PATCH 3/4] Update README.md --- stratzy/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stratzy/README.md b/stratzy/README.md index 8031e60..c033d27 100644 --- a/stratzy/README.md +++ b/stratzy/README.md @@ -5,7 +5,7 @@ #### Team Leader Email - mohit@stratzy.in ## A Brief of the Prototype: -Diagram +Screenshot 2023-08-24 at 3 26 34 PM We use an LLM model to create a summary of the management concalls which allows retail investors to quickly glance over the main pointers organised by each speaker. 
Each pointer is clickable which takes them to the page on the pdf where the information in the pointer is referenced from. From ad99366a440aa53ebf285b77c59ffe4cb913fd3d Mon Sep 17 00:00:00 2001 From: Ved Sirdeshmukh Date: Thu, 24 Aug 2023 15:27:28 +0530 Subject: [PATCH 4/4] Update PDFSummary.py --- stratzy/PDFSummary.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/stratzy/PDFSummary.py b/stratzy/PDFSummary.py index 6659ea7..bf084ee 100644 --- a/stratzy/PDFSummary.py +++ b/stratzy/PDFSummary.py @@ -15,7 +15,6 @@ import os import json os.environ["OPEN_API_KEY"] = "J3dWgPvniwfXTWCOry2yT3BlbkFJkGxhfh9GYeXK1IL0ZEj0" -#r = redis.StrictRedis(host='10.0.0.105', port=6379, db=0) llm = OpenAI(openai_api_key="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",temperature=0) @@ -87,8 +86,5 @@ def driver(tickers): print('Converted to JSON') print(json_summary) - # #-------- Save the file path to redis -----------# - # r.set('earnings_summary_{}'.format(ticker), json.dumps(json_summary)) - tickers = [t[:-4] for t in os.listdir('Documents')] -driver(tickers) \ No newline at end of file +driver(tickers)