Skip to content

Commit

Permalink
3.3.2 - done. using strings for choosing the scorer (default is gpt2)…
Browse files Browse the repository at this point in the history
…. Using poetry for dependency management. Added some instructions for choosing the scorer in README.md. `main_streamlit.py` largely updated - with reports of top candidates now included in the explanation.
  • Loading branch information
Jordan Kim committed Apr 2, 2024
1 parent 4cb6f23 commit e7a1767
Show file tree
Hide file tree
Showing 11 changed files with 2,531 additions and 166 deletions.
25 changes: 25 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,32 @@ pprint(styler.logs['guess']['out'])
0.0125)]
```

### 5️⃣ Take context into account with the `gpt2` scorer

The `heuristic` scorer is fast, but it falls short of taking context into account:
```python
styler = Styler(scorer="heuristic")
print("##### lm을 쓰지 않는 경우 맥락 고려 X ######")
print(styler("내일 저랑 같이 점심 먹어요.", 0))
```

```
##### lm을 쓰지 않는 경우 맥락 고려 X ######
내일 나랑 같이 점심 먹어.
```

The `gpt2` scorer is a bit slower, but it does take context into account:
```python
from politely.modeling_gpt2_scorer import GPT2Scorer
styler = Styler(scorer="gpt2") # uses GPT2Scorer by default
print("##### lm을 쓰는 경우 맥락 고려 O ######")
print(styler("내일 저랑 같이 점심 먹어요.", 0))
```

```
##### lm을 쓰는 경우 맥락 고려 O ######
내일 나랑 같이 점심 먹자. # 권유가 아닌 청유이므로 이게 맞음
```

## Hosting the interactive demo

Expand Down
2 changes: 1 addition & 1 deletion explore/add_rules_eg_2.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@
}
)
print(styler(sent, 1))
pprint(styler.logs['guess']['out'])
pprint(styler.log['guess']['out'])
6 changes: 3 additions & 3 deletions explore/explore_politely_Styler_shortcut.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@

def main():
styler = Styler()
print(styler.logs)
print(styler.logs)
print(styler.log)
print(styler.log)
print(styler("나한테 왜 그런거야?", 2))
pprint(styler.logs)
pprint(styler.log)


if __name__ == "__main__":
Expand Down
Empty file.
8 changes: 4 additions & 4 deletions main_demo_contextual.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from politely import Styler
from politely.modeling_gpt2_scorer import GPT2Scorer
from politely.modeling_heuristic_scorer import HeuristicScorer

styler = Styler(scorer=HeuristicScorer())
styler = Styler(scorer="heuristic")
print("##### lm을 쓰지 않는 경우 맥락 고려 X ######")
print(styler("내일 저랑 같이 점심 먹어요.", 0))

styler = Styler(scorer=GPT2Scorer()) # uses GPT2Scorer by default
print(styler.log.keys())

styler = Styler(scorer="gpt2") # uses GPT2Scorer by default
print("##### lm을 쓰는 경우 맥락 고려 O ######")
print(styler("내일 저랑 같이 점심 먹어요.", 0))

Expand Down
152 changes: 89 additions & 63 deletions main_streamlit.py
Original file line number Diff line number Diff line change
@@ -1,59 +1,69 @@
"""
It's okay to write dirty stuff, at least as of right now.
"""
import re
from pprint import pprint

from copy import deepcopy
import pprint
import streamlit as st
import pandas as pd # noqa
import os
import requests # noqa
import yaml # noqa
import pandas as pd
import yaml
import openai
from politely import Styler, SEP
from politely.errors import SFNotIncludedError, EFNotSupportedError
from dotenv import load_dotenv
from loguru import logger
load_dotenv()


# --- constants --- #
RULES_YAML_STR = """friends and junior:
comfortable & informal:
politeness: 1
politeness: 0
reason: A comfortable and informal situation is a very relaxed situation for all, so you may speak to your friends and juniors in a casual style (`-어`).
formal:
politeness: 2
politeness: 1
reason: If there are observers around or the situation is rather formal, then you and your listener may not find it completely relaxing. If so, you should speak in a polite style (`-어요`) even when you are speaking to your friends and juniors.
boss at work:
comfortable & informal:
politeness: 2
politeness: 1
reason: If you are in an informal situation with your boss, e.g. a company dinner, then you and your boss may find it a little more relaxing than at the work place. Therefore, it is not necessary to speak in a formal style, and you may speak to your boss in a polite style (`-어요`).
formal:
politeness: 3
politeness: 2
reason: If you are in a highly formal environment, e.g. an important meeting, you should always speak in a formal style (`-읍니다`). This shows the appropriate respect to your listeners in a high-profile context.
adult family:
comfortable & informal:
politeness: 1
politeness: 0
reason: If you are in a relaxed setting, it is customary and allowed to speak to your family members in a casual style (`-어`) even when they are older than you.
formal:
politeness: 2
politeness: 1
reason: If someone outside of your family, e.g. a neighbour, is partaking the conversation too, then it is customary to speak to your family in a polite style (`-어요`) so that you and your family come acorss polite to the outsiders."""
RULES = yaml.safe_load(RULES_YAML_STR)
LISTENERS = pd.DataFrame(RULES).transpose().index.tolist()
ENVIRONS = pd.DataFrame(RULES).transpose().columns.tolist()

# change the papago API with GPT-3.5-turbo

SYSTEM_PROMPT = """
you are a masterful translator of English to Korean.
Translate the following English sentence(s) given by the user to Korean sentence(s).
When more than one sentence is given as an input, give your translation in multiple sentences accordingly (merge them to one if appropriate).
"""

def translate(text: str) -> str:
url = "https://openapi.naver.com/v1/papago/n2mt"
headers = {
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
"X-Naver-Client-Id": os.environ["NAVER_CLIENT_ID"],
"X-Naver-Client-Secret": os.environ["NAVER_CLIENT_SECRET"],
}
data = {"source": "en", "target": "ko", "text": text, "honorific": False}
r = requests.post(url, headers=headers, data=data)
r.raise_for_status()
return r.json()["message"]["result"]["translatedText"]
r = openai.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system",
"content": SYSTEM_PROMPT},
{"role": "user",
"content": text}
]
)
translated_text = r.choices[0].message.content
return translated_text


def explain(logs: dict, eng: str):
def explain(logs: list[dict], eng: str):
# CSS to inject contained in a string
hide_table_row_index = """
<style>
Expand All @@ -64,74 +74,90 @@ def explain(logs: dict, eng: str):
# Inject CSS with Markdown
st.markdown(hide_table_row_index, unsafe_allow_html=True)
# --- step 1 ---
msg = "### 1️⃣ Translate the sentence(s) to Korean"
before = eng
after = " ".join(logs["__call__"]["in"]["sents"])
df = pd.DataFrame([(before, after)], columns=["before", "after"])
msg = "### 1️⃣ Translate the sentence to Korean"
st.markdown(msg)
st.markdown(df.to_markdown(index=False))
for log in logs:
before = eng
logger.debug(pprint.pformat(log.keys()))
after: str = log["preprocess"]["in"]["sent"]
df = pd.DataFrame([(before, after)], columns=["before", "after"])
st.markdown(df.to_markdown(index=False))
# --- step 2 ---
msg = "### 2️⃣ Determine politeness"
politeness = logs["__call__"]["in"]["politeness"]
log = logs[0] # anything will suffice
politeness = log["honorify"]["in"]["politeness"]
politeness = (
"casual style (-어)"
if politeness == 1
else "polite style (-어요)"
if politeness == 2
else "formal style (-습니다)"
"casual style (-어)"
if politeness == 1
else "polite style (-어요)"
if politeness == 2
else "formal style (-습니다)"
)
reason = logs["case"]["reason"]
reason = log["case"]["reason"]
msg += (
f"\nYou should speak in a `{politeness}` to your `{logs['listener']}`"
f" when you are in a `{logs['environ']}` environment."
f"\nYou should speak in a `{politeness}` to your `{log['listener']}`"
f" when you are in a `{log['environ']}` environment."
)
msg += f"\n\n Why so? {reason}"
st.markdown(msg)
# --- step 3 ---
msg = f"### 3️⃣ Analyze morphemes"
pprint(logs['__call__']['in'])
pprint(logs['analyze']['in']['self'].out)

before = after
after = " ".join(logs["analyze"]["out"]).replace(SEP, " ")
df = pd.DataFrame([(before, after)], columns=["before", "after"])
st.markdown(msg)
st.markdown(df.to_markdown(index=False))
for log in logs:
before = " ".join(log["preprocess"]["in"]["sent"])
after = " ".join(log["analyze"]["out"]).replace(SEP, " ")
df = pd.DataFrame([(before, after)], columns=["before", "after"])
st.markdown(df.to_markdown(index=False))
# --- step 4 ---
msg = f"### 4️⃣ Apply honorifics"
before = SEP.join(logs["analyze"]["out"])
after = SEP.join(logs["honorify"]["out"])
# for pattern, honorific in logs['honorifics']:
# before = re.sub(pattern, r'`\g<0>`', before)
# after = re.sub(pattern, honorific, before)
df = pd.DataFrame(
[(before, after)], columns=["before", "after"]
)
st.markdown(msg)
st.markdown(df.to_markdown(index=False))
for log in logs:
st.markdown("Honorifics applied:")
before = log["analyze"]["out"]
print("elect - out", log["elect"]["out"])
after = SEP.join(log["elect"]["out"][0])
df = pd.DataFrame(
[(before, after)], columns=["before", "after"]
)
st.markdown(df.to_markdown(index=False))
st.markdown("Top candidates:")
top_3_candidate_pairs = list(sorted(log['guess']['out'], key= lambda x: x[1], reverse=True))[:2]
top_3_candidates = [SEP.join(candidate) for candidate, _ in top_3_candidate_pairs]
top_3_scores = [score for _, score in top_3_candidate_pairs]
df = pd.DataFrame(list(zip(top_3_candidates, top_3_scores)), columns=["candidate", "score"])
st.markdown(df.to_markdown(index=False))
st.markdown("---")

# # --- step 5 ---
msg = "### 5️⃣ Conjugate morphemes"
before = " ".join(logs["honorify"]["out"]).replace(SEP, " ")
after = " ".join(logs["conjugate"]["out"])
df = pd.DataFrame([(before, after)], columns=["before", "after"])
st.markdown(msg)
st.markdown(df.to_markdown(index=False))
for log in logs:
before = SEP.join(log["elect"]["out"][0])
after = " ".join(log["conjugate"]["out"])
df = pd.DataFrame([(before, after)], columns=["before", "after"])
st.markdown(df.to_markdown(index=False))


def describe_case(styler: Styler, eng: str, kor: str, listener: str, environ: str):
try:
logs = list()
tuned_sents = list()
case = RULES[listener][environ]
sents = [sent.text for sent in styler.kiwi.split_into_sents(kor)]
tuned = styler(sents, case["politeness"])
logger.debug(f"listener: {listener}, environ: {environ}, sents: {sents}")
for sent in sents:
tuned_sent = styler(sent, case["politeness"])
tuned_sents.append(tuned_sent)
styler.log.update({"listener": listener, "environ": environ, "case": case})
logs.append(deepcopy(styler.log))
except SFNotIncludedError as e1:
st.error("ERROR: " + str(e1))
except EFNotSupportedError as e2:
st.error("ERROR: " + str(e2))
else:
st.write(" ".join(tuned))
st.write(" ".join(tuned_sents))
with st.expander("Need an explanation?"):
styler.logs.update({"listener": listener, "environ": environ, "case": case})
explain(styler.logs, eng)
explain(logs, eng)


def main():
Expand All @@ -141,7 +167,7 @@ def main():
"- 💡: [Jieun Kiaer](https://www.orinst.ox.ac.uk/people/jieun-kiaer) & [Eu-Bin"
" KIM](https://github.com/eubinecto) @ the Univerity of Oxford\n- ⚡️:"
" [`kiwipiepy`](https://github.com/bab2min/kiwipiepy) for analyzing Korean morphemes &"
" [`papago`](https://papago.naver.com/?sk=auto&tk=ko&hn=1&st=hello%20world) for"
" `gpt-3.5-turbo`for"
" english-to-korean translations\n- The code that runs this website is"
" [publicly available on Github](https://github.com/eubinecto/kps). Please"
" leave a ⭐ if you like what we are building!"
Expand All @@ -151,7 +177,7 @@ def main():
"Type English sentences to translate with honorifics",
value="Bring your work to fruition. Done is better than perfect.",
)
styler = Styler(strict=True)
styler = Styler(strict=True, scorer="gpt2")
if st.button(label="Translate"):
with st.spinner("Please wait..."):
kor = translate(eng)
Expand Down
Loading

0 comments on commit e7a1767

Please sign in to comment.