3.3.2 - done. using strings for choosing the scorer (default is gpt2)…

…. Using poetry for dependency management. Added some instructions for choosing the scorer in README.md. `main_streamlit.py` largely updated - with reports of top candidates now included in the explanation.
eubinecto · Apr 2, 2024 · e7a1767 · e7a1767
1 parent 4cb6f23
commit e7a1767
Show file tree

Hide file tree

Showing 11 changed files with 2,531 additions and 166 deletions.
diff --git a/README.md b/README.md
@@ -136,7 +136,32 @@ pprint(styler.logs['guess']['out'])
   0.0125)]
 ```
 
+### 5️⃣ Take contexts into account with the `gpt2` scorer 
 
+The `heuristic` scorer is fast, but it falls short of taking context into account:
+```python
+styler = Styler(scorer="heuristic")
+print("##### lm을 쓰지 않는 경우 맥락 고려 X ######")
+print(styler("내일 저랑 같이 점심 먹어요.", 0))
+```
+
+```
+##### lm을 쓰지 않는 경우 맥락 고려 X ######
+내일 나랑 같이 점심 먹어.
+```
+
+The `gpt2` scorer is a bit slower, but it does take context into account:
+```python
+from politely.modeling_gpt2_scorer import GPT2Scorer
+styler = Styler(scorer="gpt2")  # uses GPT2Scorer by default
+print("##### lm을 쓰는 경우 맥락 고려 O ######")
+print(styler("내일 저랑 같이 점심 먹어요.", 0))
+```
+
+```
+##### lm을 쓰는 경우 맥락 고려 O ######
+내일 나랑 같이 점심 먹자.  # 권유가 아닌 청유이므로 이게 맞음
+```
 
 ## Hosting the interactive demo 
 

diff --git a/explore/add_rules_eg_2.py b/explore/add_rules_eg_2.py
@@ -24,4 +24,4 @@
     }
 )
 print(styler(sent, 1))
-pprint(styler.logs['guess']['out'])
+pprint(styler.log['guess']['out'])
diff --git a/explore/explore_politely_Styler_shortcut.py b/explore/explore_politely_Styler_shortcut.py
@@ -4,10 +4,10 @@
 
 def main():
     styler = Styler()
-    print(styler.logs)
-    print(styler.logs)
+    print(styler.log)
+    print(styler.log)
     print(styler("나한테 왜 그런거야?", 2))
-    pprint(styler.logs)
+    pprint(styler.log)
 
 
 if __name__ == "__main__":

diff --git a/explore/explore_politely_no_call_.py b/explore/explore_politely_no_call_.py
diff --git a/main_demo_contextual.py b/main_demo_contextual.py
@@ -1,12 +1,12 @@
 from politely import Styler
-from politely.modeling_gpt2_scorer import GPT2Scorer
-from politely.modeling_heuristic_scorer import HeuristicScorer
 
-styler = Styler(scorer=HeuristicScorer())
+styler = Styler(scorer="heuristic")
 print("##### lm을 쓰지 않는 경우 맥락 고려 X ######")
 print(styler("내일 저랑 같이 점심 먹어요.", 0))
 
-styler = Styler(scorer=GPT2Scorer())  # uses GPT2Scorer by default
+print(styler.log.keys())
+
+styler = Styler(scorer="gpt2")  # uses GPT2Scorer by default
 print("##### lm을 쓰는 경우 맥락 고려 O ######")
 print(styler("내일 저랑 같이 점심 먹어요.", 0))
 

diff --git a/main_streamlit.py b/main_streamlit.py
@@ -1,59 +1,69 @@
 """
 It's okay to write dirty stuff, at least as of right now.
 """
-import re
-from pprint import pprint
 
+from copy import deepcopy
+import pprint
 import streamlit as st
-import pandas as pd  # noqa
-import os
-import requests  # noqa
-import yaml  # noqa
+import pandas as pd 
+import yaml
+import openai
 from politely import Styler, SEP
 from politely.errors import SFNotIncludedError, EFNotSupportedError
+from dotenv import load_dotenv
+from loguru import logger
+load_dotenv()
 
 
 # --- constants --- #
 RULES_YAML_STR = """friends and junior:
   comfortable & informal:
-    politeness: 1
+    politeness: 0
     reason: A comfortable and informal situation is a very relaxed situation for all, so you may speak to your friends and juniors in a casual style (`-어`).
   formal:
-    politeness: 2
+    politeness: 1
     reason: If there are observers around or the situation is rather formal, then you and your listener may not find it completely relaxing. If so, you should speak in a polite style (`-어요`) even when you are speaking to your friends and juniors.
 boss at work:
   comfortable & informal:
-    politeness: 2
+    politeness: 1
     reason: If you are in an informal situation with your boss, e.g. a company dinner, then you and your boss may find it a little more relaxing than at the work place. Therefore, it is not necessary to speak in a formal style, and you may speak to your boss in a polite style (`-어요`).
   formal:
-    politeness: 3
+    politeness: 2
     reason: If you are in a highly formal environment, e.g. an important meeting, you should always speak in a formal style (`-읍니다`). This shows the appropriate respect to your listeners in a high-profile context.
 adult family:
   comfortable & informal:
-    politeness: 1
+    politeness: 0
     reason: If you are in a relaxed setting, it is customary and allowed to speak to your family members in a casual style (`-어`) even when they are older than you.
   formal:
-    politeness: 2
+    politeness: 1
     reason: If someone outside of your family, e.g. a neighbour, is partaking the conversation too, then it is customary to speak to your family in a polite style (`-어요`) so that you and your family come acorss polite to the outsiders."""
 RULES = yaml.safe_load(RULES_YAML_STR)
 LISTENERS = pd.DataFrame(RULES).transpose().index.tolist()
 ENVIRONS = pd.DataFrame(RULES).transpose().columns.tolist()
 
+# Translation uses GPT-3.5-turbo in place of the Papago API.
+
+SYSTEM_PROMPT = """
+you are a masterful translator of English to Korean.
+Translate the following English sentence(s) given by the user to Korean sentence(s).
+When more than one sentence is given as an input, give your translation in multiple sentences accordingly (merge them to one if appropriate).
+"""
 
 def translate(text: str) -> str:
-    url = "https://openapi.naver.com/v1/papago/n2mt"
-    headers = {
-        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
-        "X-Naver-Client-Id": os.environ["NAVER_CLIENT_ID"],
-        "X-Naver-Client-Secret": os.environ["NAVER_CLIENT_SECRET"],
-    }
-    data = {"source": "en", "target": "ko", "text": text, "honorific": False}
-    r = requests.post(url, headers=headers, data=data)
-    r.raise_for_status()
-    return r.json()["message"]["result"]["translatedText"]
+    r = openai.chat.completions.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system",
+             "content": SYSTEM_PROMPT},
+            {"role": "user",
+             "content": text}
+        ]
+    )
+    translated_text = r.choices[0].message.content
+    return translated_text
 
 
-def explain(logs: dict, eng: str):
+def explain(logs: list[dict], eng: str):
     # CSS to inject contained in a string
     hide_table_row_index = """
                        <style>
@@ -64,74 +74,90 @@ def explain(logs: dict, eng: str):
     # Inject CSS with Markdown
     st.markdown(hide_table_row_index, unsafe_allow_html=True)
     # --- step 1 ---
-    msg = "### 1️⃣ Translate the sentence(s) to Korean"
-    before = eng
-    after = " ".join(logs["__call__"]["in"]["sents"])
-    df = pd.DataFrame([(before, after)], columns=["before", "after"])
+    msg = "### 1️⃣ Translate the sentence to Korean"
     st.markdown(msg)
-    st.markdown(df.to_markdown(index=False))
+    for log in logs:
+        before = eng
+        logger.debug(pprint.pformat(log.keys()))
+        after: str = log["preprocess"]["in"]["sent"]
+        df = pd.DataFrame([(before, after)], columns=["before", "after"])
+        st.markdown(df.to_markdown(index=False))
     # --- step 2 ---
     msg = "### 2️⃣ Determine politeness"
-    politeness = logs["__call__"]["in"]["politeness"]
+    log = logs[0]  #  anything will suffice
+    politeness = log["honorify"]["in"]["politeness"]
     politeness = (
-        "casual style (-어)"
-        if politeness == 1
-        else "polite style (-어요)"
-        if politeness == 2
-        else "formal style (-습니다)"
+    "casual style (-어)"
+    if politeness == 1
+    else "polite style (-어요)"
+    if politeness == 2
+    else "formal style (-습니다)"
     )
-    reason = logs["case"]["reason"]
+    reason = log["case"]["reason"]
     msg += (
-        f"\nYou should speak in a `{politeness}` to your `{logs['listener']}`"
-        f" when you are in a `{logs['environ']}` environment."
+        f"\nYou should speak in a `{politeness}` to your `{log['listener']}`"
+        f" when you are in a `{log['environ']}` environment."
     )
     msg += f"\n\n Why so? {reason}"
     st.markdown(msg)
     # --- step 3 ---
     msg = f"### 3️⃣ Analyze morphemes"
-    pprint(logs['__call__']['in'])
-    pprint(logs['analyze']['in']['self'].out)
-
-    before = after
-    after = " ".join(logs["analyze"]["out"]).replace(SEP, " ")
-    df = pd.DataFrame([(before, after)], columns=["before", "after"])
     st.markdown(msg)
-    st.markdown(df.to_markdown(index=False))
+    for log in logs:
+        before = " ".join(log["preprocess"]["in"]["sent"])
+        after = " ".join(log["analyze"]["out"]).replace(SEP, " ")
+        df = pd.DataFrame([(before, after)], columns=["before", "after"])
+        st.markdown(df.to_markdown(index=False))
     # --- step 4 ---
     msg = f"### 4️⃣ Apply honorifics"
-    before = SEP.join(logs["analyze"]["out"])
-    after = SEP.join(logs["honorify"]["out"])
-    # for pattern, honorific in logs['honorifics']:
-    #     before = re.sub(pattern, r'`\g<0>`', before)
-    #     after = re.sub(pattern, honorific, before)
-    df = pd.DataFrame(
-        [(before, after)], columns=["before", "after"]
-    )
     st.markdown(msg)
-    st.markdown(df.to_markdown(index=False))
+    for log in logs:
+        st.markdown("Honorifics applied:")
+        before = log["analyze"]["out"]
+        print("elect - out", log["elect"]["out"])
+        after = SEP.join(log["elect"]["out"][0])
+        df = pd.DataFrame(
+            [(before, after)], columns=["before", "after"]
+        )
+        st.markdown(df.to_markdown(index=False))
+        st.markdown("Top candidates:")
+        top_3_candidate_pairs = list(sorted(log['guess']['out'], key= lambda x: x[1], reverse=True))[:2]
+        top_3_candidates = [SEP.join(candidate) for candidate, _ in top_3_candidate_pairs]
+        top_3_scores = [score for _, score in top_3_candidate_pairs]
+        df = pd.DataFrame(list(zip(top_3_candidates, top_3_scores)), columns=["candidate", "score"])
+        st.markdown(df.to_markdown(index=False))
+        st.markdown("---")
+
     # # --- step 5 ---
     msg = "### 5️⃣ Conjugate morphemes"
-    before = " ".join(logs["honorify"]["out"]).replace(SEP, " ")
-    after = " ".join(logs["conjugate"]["out"])
-    df = pd.DataFrame([(before, after)], columns=["before", "after"])
     st.markdown(msg)
-    st.markdown(df.to_markdown(index=False))
+    for log in logs:
+        before = SEP.join(log["elect"]["out"][0])
+        after = " ".join(log["conjugate"]["out"])
+        df = pd.DataFrame([(before, after)], columns=["before", "after"])
+        st.markdown(df.to_markdown(index=False))
 
 
 def describe_case(styler: Styler, eng: str, kor: str, listener: str, environ: str):
     try:
+        logs = list()
+        tuned_sents = list()
         case = RULES[listener][environ]
         sents = [sent.text for sent in styler.kiwi.split_into_sents(kor)]
-        tuned = styler(sents, case["politeness"])
+        logger.debug(f"listener: {listener}, environ: {environ}, sents: {sents}")
+        for sent in sents:
+            tuned_sent = styler(sent, case["politeness"])
+            tuned_sents.append(tuned_sent)
+            styler.log.update({"listener": listener, "environ": environ, "case": case})
+            logs.append(deepcopy(styler.log))
     except SFNotIncludedError as e1:
         st.error("ERROR: " + str(e1))
     except EFNotSupportedError as e2:
         st.error("ERROR: " + str(e2))
     else:
-        st.write(" ".join(tuned))
+        st.write(" ".join(tuned_sents))
         with st.expander("Need an explanation?"):
-            styler.logs.update({"listener": listener, "environ": environ, "case": case})
-            explain(styler.logs, eng)
+            explain(logs, eng)
 
 
 def main():
@@ -141,7 +167,7 @@ def main():
         "- 💡: [Jieun Kiaer](https://www.orinst.ox.ac.uk/people/jieun-kiaer) & [Eu-Bin"
         " KIM](https://github.com/eubinecto) @ the Univerity of Oxford\n- ⚡️:"
         " [`kiwipiepy`](https://github.com/bab2min/kiwipiepy) for analyzing Korean morphemes &"
-        " [`papago`](https://papago.naver.com/?sk=auto&tk=ko&hn=1&st=hello%20world) for"
+        " `gpt-3.5-turbo`for"
         " english-to-korean translations\n- The code that runs this website is"
         " [publicly available on Github](https://github.com/eubinecto/kps). Please"
         " leave a ⭐ if you like what we are building!"
@@ -151,7 +177,7 @@ def main():
         "Type English sentences to translate with honorifics",
         value="Bring your work to fruition. Done is better than perfect.",
     )
-    styler = Styler(strict=True)
+    styler = Styler(strict=True, scorer="gpt2")
     if st.button(label="Translate"):
         with st.spinner("Please wait..."):
             kor = translate(eng)