Submission-stratzy #2

Open · wants to merge 4 commits into base: main
46 changes: 46 additions & 0 deletions stratzy/PDFScraper.py
@@ -0,0 +1,46 @@
import os
from bs4 import BeautifulSoup
import requests
import datetime
import shutil

def getPDFLinks(tickers):
    """Return the latest earnings-call transcript link for each ticker."""
    if not isinstance(tickers, list):
        raise TypeError('Provide a list of tickers!')
    links = {}
    for t in tickers:
        URL = 'https://www.screener.in/company/' + t + '/consolidated/'
        req = requests.get(URL)
        soup = BeautifulSoup(req.content, 'html5lib')

        try:
            earnings_links_with_date = soup.find(class_='documents concalls flex-column').find('ul').find_all('li')
        except AttributeError:  # page has no concalls section
            links[t] = None
            continue
        earnings_dict = {}

        # Collect every "Raw Transcript" link, keyed by its ISO-formatted date.
        for li in earnings_links_with_date:
            link = li.findChild(title='Raw Transcript')
            if not link:
                continue
            d = li.findChild('div').string.replace(' ', '').replace('\n', '')
            d = datetime.datetime.strptime(d, '%b%Y').strftime('%Y-%m-%d')
            earnings_dict[d] = link.attrs['href']
        # ISO dates sort lexicographically, so max() picks the latest transcript.
        links[t] = earnings_dict[max(earnings_dict)] if earnings_dict else None

    return links

tickers = ['RELIANCE', 'CIPLA']  # Input any list of tickers

links = getPDFLinks(tickers)  # FORMAT: {Ticker: Link, Ticker: Link}

# Recreate the output directory from scratch on every run.
path = 'Documents'
if os.path.exists(path) and os.path.isdir(path):
    shutil.rmtree(path)
os.mkdir(path)

headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
for t, l in links.items():
    print(t, l)
    if l is None:  # no transcript was found for this ticker
        continue
    r = requests.get(l, headers=headers)
    with open('Documents/{}.pdf'.format(t), 'wb') as f:
        f.write(r.content)
90 changes: 90 additions & 0 deletions stratzy/PDFSummary.py
@@ -0,0 +1,90 @@
from langchain.text_splitter import NLTKTextSplitter  # sentence-aware text splitter
from langchain.indexes import VectorstoreIndexCreator  # vector-store index (Chroma by default)
from langchain.llms import OpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
import ast
import nltk
import os

# Read the OpenAI key from the environment rather than hardcoding a secret.
openai_api_key = os.environ['OPENAI_API_KEY']

# temperature=0 keeps the model's output deterministic across runs.
llm = OpenAI(openai_api_key=openai_api_key, temperature=0)

def getLLM(path):
    """Build a conversational retrieval chain over the PDF at `path`."""
    loader = PyPDFLoader(path)
    index = VectorstoreIndexCreator(
        embedding=OpenAIEmbeddings(openai_api_key=openai_api_key),
        text_splitter=NLTKTextSplitter(chunk_size=50000)).from_loaders([loader])
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    chain = ConversationalRetrievalChain.from_llm(llm, index.vectorstore.as_retriever(), memory=memory, chain_type='map_reduce')
    return chain, memory, index

def formatBulletList(x):
    """Re-join split bullet fragments into a single •-prefixed block."""
    if len(x[0]) < 5:  # drop any empty preamble before the first bullet
        x.pop(0)
    modified_summ = ''
    for i in x:
        modified_summ += '•' + i
    return '\n' + modified_summ

def getSpeakers(chain, memory, ticker):
    memory.clear()
    # The first question extracts candidate names; the follow-up refines the
    # same conversation so only management speakers survive.
    chain({'question': "The transcript starts with the management speaking. Who are the speakers from management? Print ONLY a python list. Only print the names. Do not include anyone who is not from {}.".format(ticker)})
    speakers = chain({'question': "Only keep the names of speakers which are from management. Print ONLY a python list and make sure the formatting will work inside an eval statement."})['answer']
    speakers = ast.literal_eval(speakers)  # safer than eval() on model output
    return speakers

def getSummary(chain, memory, speakers):
    speakers_with_summary = {}
    for s in speakers:
        memory.clear()
        print(s)
        summ = chain({'question': "Print a summary of what {} said in the transcript in 10 unordered bullet points. For each bullet point, include context surrounding the idea so it is easily understood. The list should include key information about the business and plans for the future. The summary should not only focus on metrics and numbers but also highlight {}'s narrative and vision for the company's future. Each bullet point should end with a period.".format(s, s)})['answer']
        # If the answer was cut off mid-sentence, ask the model to continue,
        # then merge the truncated last bullet with its continuation.
        if summ[-1] not in ['\n', '.']:
            answer_split = summ.split('•')
            incomplete = chain({'question': 'continue'})['answer']
            completed_point = llm('Print the correct combination of these two sentences. Sentence 1: {} Sentence 2: {}'.format(answer_split[-1], incomplete)).replace('\n', '').replace('"', '')
            sentences = nltk.sent_tokenize(completed_point)
            sentences = [' ' + x + '\n' for x in sentences]
            answer_split.pop()
            answer_split = answer_split + sentences
            summ = formatBulletList(answer_split)
        speakers_with_summary[s] = summ
    return speakers_with_summary


def getJSON(speakers_with_summ, index):
    points_json = []
    for k, v in speakers_with_summ.items():
        for p in v.split('•'):
            if len(p) < 5:  # skip empty fragments between bullets
                continue
            # Look up which PDF page the bullet point was drawn from.
            pgnum = index.vectorstore.similarity_search_with_relevance_scores('Where is this line: {}'.format(p))[-1][0].dict()['metadata']['page']
            points_json.append({'name': p, 'speaker': k, 'pageNumber': pgnum})
    return points_json

def driver(tickers):
    for ticker in tickers:
        print(ticker)
        path = 'Documents/' + ticker + '.pdf'  # S3 path
        chain, memory, index = getLLM(path)
        print('LLM Initialised')
        speakers = getSpeakers(chain, memory, ticker)
        print('Speakers Detected')
        print(speakers)
        summary = getSummary(chain, memory, speakers)
        for k, v in summary.items():
            print(k, v)
        print('Transcript Summarised')
        json_summary = getJSON(summary, index)
        print('Converted to JSON')
        print(json_summary)

tickers = [os.path.splitext(t)[0] for t in os.listdir('Documents')]  # strip the .pdf extension
driver(tickers)
21 changes: 21 additions & 0 deletions stratzy/README.md
@@ -0,0 +1,21 @@
# Empowering-Investors-Hackathon

#### Team Name - stratzy
#### Problem Statement - Content Curation and Education: retail investors rarely have the time or inclination to read and understand management concalls/transcripts
#### Team Leader Email - [email protected]

## A Brief of the Prototype:
<img width="714" alt="Screenshot 2023-08-24 at 3 26 34 PM" src="https://github.com/ved115/Empowering-Investors-Hackathon/assets/7903563/3489b977-f02c-4f0b-88ea-e7d99b2460ce">

We use an LLM to summarise management concalls so that retail investors can quickly scan the main points, organised by speaker. Each point is clickable and takes the reader to the page of the PDF from which that information was sourced.
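To make each point clickable, `getJSON` in `PDFSummary.py` attaches the source page to every bullet. A sketch of the structure it emits, with purely illustrative values:

```python
# Illustrative shape of getJSON()'s output; the speaker names, bullet text,
# and page numbers below are made up, not real model output.
points_json = [
    {'name': ' Revenue growth was driven by strong retail demand.\n',
     'speaker': 'Speaker A', 'pageNumber': 3},
    {'name': ' Capital expenditure on new capacity will continue next year.\n',
     'speaker': 'Speaker A', 'pageNumber': 7},
]
```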

## Tech Stack:
Python, GPT-3.5
Libraries: LangChain, BeautifulSoup, NLTK

## Step-by-Step Code Execution Instructions:
1. Run `PDFScraper.py` to obtain a PDF link for each ticker; each link points to the concall transcript of that company's latest earnings release. The script also downloads the PDF for each ticker into a local `Documents/` directory.
2. Run `PDFSummary.py` to print the per-speaker summaries for each ticker. The page number for each point is also recorded in JSON form for further analysis (a minimal combined runner is sketched below).
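
A minimal sketch of both steps from one script, under the assumption that the module-level code in `PDFScraper.py` and `PDFSummary.py` is moved behind `if __name__ == '__main__':` guards so the modules can be imported without side effects:

```python
# Hypothetical one-shot pipeline; assumes PDFScraper.py and PDFSummary.py
# expose getPDFLinks() and driver() and are importable without side effects.
import os
import requests
from PDFScraper import getPDFLinks
from PDFSummary import driver

tickers = ['RELIANCE', 'CIPLA']
os.makedirs('Documents', exist_ok=True)
headers = {'User-Agent': 'Mozilla/5.0'}  # screener.in rejects the default UA
for ticker, link in getPDFLinks(tickers).items():
    if link is None:  # no transcript found for this ticker
        continue
    pdf = requests.get(link, headers=headers)
    with open('Documents/{}.pdf'.format(ticker), 'wb') as f:
        f.write(pdf.content)

driver(tickers)  # prints per-speaker summaries and the page-number JSON
```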

## What I Learned:
LLMs are much more complex than they appear. It is difficult to obtain perfect output every time from a model such as GPT-3.5; for example, when prompted to produce code, the model's output usually contains bugs that prevent it from running. The main goal of this project was to produce consistent outputs that do not change significantly across runs on the same inputs, so obtaining deterministic behaviour was crucial to building this tool.
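
That determinism comes mainly from greedy decoding; a minimal sketch of the setting used in `PDFSummary.py`:

```python
# temperature=0 makes the model pick its highest-probability token at every
# step, so repeated runs over the same transcript stay (near-)identical.
import os
from langchain.llms import OpenAI

llm = OpenAI(openai_api_key=os.environ['OPENAI_API_KEY'], temperature=0)
```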