Add new setting: prompt_column_separator #724

Merged · 15 commits · Jul 10, 2024
5 changes: 5 additions & 0 deletions documentation/docs/guide/experiments/experiment-settings.md

@@ -11,6 +11,7 @@ import DSvalidationStrategy from '../../tooltips/experiments/_validation-strategy.mdx';
 import DSvalidationSize from '../../tooltips/experiments/_validation-size.mdx';
 import DSdataSample from '../../tooltips/experiments/_data-sample.mdx';
 import DSpromptColumn from '../../tooltips/experiments/_prompt-column.mdx';
+import DSPromptColumnSeparator from '../../tooltips/experiments/_prompt-column-separator.mdx';
 import DSsystemColumn from '../../tooltips/experiments/_system-column.mdx';
 import DSanswerColumn from '../../tooltips/experiments/_answer-column.mdx';
 import DSparentIdColumn from '../../tooltips/experiments/_parent-id-column.mdx';
@@ -141,6 +142,10 @@ The settings under each category are listed and described below.

 <DSpromptColumn/>

+### Prompt column separator
+
+<DSPromptColumnSeparator/>
+
 ### Answer column

 <DSanswerColumn/>
1 change: 1 addition & 0 deletions documentation/docs/tooltips/experiments/_prompt-column-separator.mdx

@@ -0,0 +1 @@
+If multiple prompt columns are selected, the columns are concatenated with the separator defined here. If only a single prompt column is selected, this setting is ignored.
2 changes: 1 addition & 1 deletion documentation/docs/tooltips/experiments/_prompt-column.mdx

@@ -1 +1 @@
-The column in the dataset containing the user prompt.
+One column or multiple columns in the dataset containing the user prompt. If multiple columns are selected, the columns are concatenated with a separator defined in **Prompt Column Separator**.
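A minimal illustration of the documented behavior (column names and values are hypothetical):

```python
# Two prompt columns selected; "\n\n" is the new setting's default separator.
row = {
    "instruction": "Summarize the text.",
    "input": "H2O LLM Studio is a no-code GUI for fine-tuning LLMs.",
}
prompt_columns = ("instruction", "input")
separator = "\n\n"

prompt = separator.join(row[c] for c in prompt_columns)
print(prompt)
# Summarize the text.
#
# H2O LLM Studio is a no-code GUI for fine-tuning LLMs.

# With a single prompt column, the separator is ignored:
prompt = row["instruction"]
```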
4 changes: 3 additions & 1 deletion llm_studio/app_utils/hugging_face_utils.py

@@ -40,7 +40,9 @@ def get_model_card(cfg, model, repo_id) -> huggingface_hub.ModelCard:
         text_answer_separator=cfg.dataset.text_answer_separator,
         trust_remote_code=cfg.environment.trust_remote_code,
         end_of_sentence=(
-            cfg._tokenizer_eos_token if cfg.dataset.add_eos_token_to_prompt else ""
+            cfg.tokenizer._tokenizer_eos_token
+            if cfg.dataset.add_eos_token_to_prompt
+            else ""
         ),
     )
     if cfg.problem_type not in NON_GENERATION_PROBLEM_TYPES:
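The `cfg._tokenizer_eos_token` to `cfg.tokenizer._tokenizer_eos_token` change recurs throughout this PR: tokenizer-derived attributes appear to move from the top-level config onto its tokenizer sub-config. A minimal sketch of the new layout (class names here are simplified assumptions, not the repository's exact definitions):

```python
from dataclasses import dataclass, field

@dataclass
class TokenizerConfig:
    # Populated at runtime from the loaded Hugging Face tokenizer.
    _tokenizer_eos_token: str = "</s>"
    _tokenizer_mask_token_id: int = 0

@dataclass
class Config:
    tokenizer: TokenizerConfig = field(default_factory=TokenizerConfig)

cfg = Config()
# Before this PR: cfg._tokenizer_eos_token
# After this PR:  cfg.tokenizer._tokenizer_eos_token
print(cfg.tokenizer._tokenizer_eos_token)
```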
2 changes: 1 addition & 1 deletion llm_studio/app_utils/sections/chat_update.py

@@ -107,7 +107,7 @@ async def answer_chat(q: Q) -> str:
     else:
         prev_message = prev_message[0]
         if cfg.dataset.add_eos_token_to_answer:
-            prev_message += cfg._tokenizer_eos_token
+            prev_message += cfg.tokenizer._tokenizer_eos_token

     full_prompt += prev_message
     logger.info(f"Full prompt: {full_prompt}")
6 changes: 5 additions & 1 deletion llm_studio/app_utils/sections/experiment.py

@@ -2070,7 +2070,11 @@ def get_experiment_summary_code_card(cfg) -> str:
     )
     text = text.replace(
         "{{end_of_sentence}}",
-        str(cfg._tokenizer_eos_token) if cfg.dataset.add_eos_token_to_prompt else "",
+        (
+            str(cfg.tokenizer._tokenizer_eos_token)
+            if cfg.dataset.add_eos_token_to_prompt
+            else ""
+        ),
     )

     text = text.replace("{{trust_remote_code}}", str(cfg.environment.trust_remote_code))
78 changes: 50 additions & 28 deletions llm_studio/app_utils/utils.py

@@ -34,6 +34,7 @@
 from sqlitedict import SqliteDict

 from llm_studio.app_utils.db import Experiment
+from llm_studio.python_configs.base import DefaultConfigProblemBase
 from llm_studio.src import possible_values
 from llm_studio.src.utils.config_utils import (
     _get_type_annotation_error,
@@ -98,12 +99,12 @@ def find_free_port():


 def start_process(
-    cfg: Any, gpu_list: List, process_queue: List, env_vars: Dict
+    cfg: DefaultConfigProblemBase, gpu_list: List, process_queue: List, env_vars: Dict
 ) -> subprocess.Popen:
     """Starts train.py for a given configuration setting

     Args:
-        cfg: config
+        cfg: DefaultConfigProblemBase config
         gpu_list: list of GPUs to use for the training
         process_queue: list of processes to wait for before starting the training
         env_vars: dictionary of ENV variables to pass to the training process
@@ -346,7 +347,7 @@ async def poll(self):
         await self.update_ui()


-def s3_download_coroutine(q, filename):
+def s3_download_coroutine(q: Q, filename: str):
     download_folder = f"{get_data_dir(q)}/tmp"
     download_folder = get_valid_temp_data_folder(q, download_folder)

@@ -370,7 +371,7 @@ def extract_if_zip(file, actual_path):


 async def s3_download(
-    q, bucket, filename, aws_access_key, aws_secret_key
+    q: Q, bucket, filename, aws_access_key, aws_secret_key
 ) -> Tuple[str, str]:
     """Downloads a file from s3

@@ -447,7 +448,7 @@ def azure_file_options(conn_string: str, container: str) -> List[str]:
         return []


-async def download_progress(q, title, seen_so_far, total_len):
+async def download_progress(q: Q, title, seen_so_far, total_len):
     if seen_so_far is not None and total_len is not None:
         percentage = seen_so_far / total_len
         value = percentage
@@ -469,7 +470,7 @@ async def download_progress(q, title, seen_so_far, total_len):


 async def azure_download(
-    q: Any, conn_string: str, container: str, filename: str
+    q: Q, conn_string: str, container: str, filename: str
 ) -> Tuple[str, str]:
     """Downloads a file from azure

@@ -531,7 +532,7 @@ async def azure_download(
     return azure_path, "".join(filename.split(".")[:-1])


-async def local_download(q: Any, filename: str) -> Tuple[str, str]:
+async def local_download(q: Q, filename: str) -> Tuple[str, str]:
     """Downloads a file from local path

     Args:
@@ -558,7 +559,7 @@ async def local_download(q: Any, filename: str) -> Tuple[str, str]:


 async def kaggle_download(
-    q: Any, command: str, kaggle_access_key: str, kaggle_secret_key: str
+    q: Q, command: str, kaggle_access_key: str, kaggle_secret_key: str
 ) -> Tuple[str, str]:
     """ "Downloads a file from kaggle

@@ -769,6 +770,23 @@ def get_dataset(
     return dataset, v


+def escape_python_string(s: str) -> str:
+    """Escapes a python string
+
+    Args:
+        s: string to escape
+
+    Returns:
+        Escaped string
+    """
+
+    s = s.replace("\\", "\\\\")
+    s = s.replace("\n", "\\n")
+    s = s.replace("\t", "\\t")
+    s = s.replace("\r", "\\r")
+    return s
+
+
 def get_ui_element(
     k: str,
     v: Any,
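The new `escape_python_string` helper feeds the textbox value below, presumably so control characters such as the `"\n\n"` separator default stay visible in the UI instead of rendering as blank lines. A self-contained usage sketch (re-stating the helper's logic so the snippet runs on its own):

```python
def escape_python_string(s: str) -> str:
    # Same replacements, in the same order, as the helper added above:
    # backslashes first, then newline, tab, and carriage return.
    for old, new in (("\\", "\\\\"), ("\n", "\\n"), ("\t", "\\t"), ("\r", "\\r")):
        s = s.replace(old, new)
    return s

# The default separator "\n\n" becomes the visible literal \n\n in a textbox.
assert escape_python_string("\n\n") == "\\n\\n"
assert escape_python_string("a\tb\r") == "a\\tb\\r"
```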
@@ -883,7 +901,7 @@ def get_ui_element(
                 ui.textbox(
                     name=pre + k,
                     label=title_label,
-                    value=val,
+                    value=escape_python_string(val),
                     required=False,
                     password=password,
                     tooltip=tooltip,
@@ -965,11 +983,11 @@ def get_ui_element(
     return t


-def get_dataset_elements(cfg: Any, q: Q) -> List:
+def get_dataset_elements(cfg: DefaultConfigProblemBase, q: Q) -> List:
     """For a given configuration setting return the according dataset ui components.

     Args:
-        cfg: configuration settings
+        cfg: DefaultConfigProblemBase configuration settings
         q: Q

     Returns:
@@ -1061,11 +1079,13 @@ def get_dataset_elements(cfg: Any, q: Q) -> List:
     return items


-def check_dependencies(cfg: Any, pre: str, k: str, q: Q, dataset_import: bool = False):
+def check_dependencies(
+    cfg: DefaultConfigProblemBase, pre: str, k: str, q: Q, dataset_import: bool = False
+):
     """Checks all dependencies for a given key

     Args:
-        cfg: configuration settings
+        cfg: DefaultConfigProblemBase configuration settings
         pre: prefix for client keys
         k: key to be checked
         q: Q
@@ -1107,7 +1127,7 @@ def check_dependencies(cfg: Any, pre: str, k: str, q: Q, dataset_import: bool =
     return True


-def is_visible(k: str, cfg: Any, q: Q) -> bool:
+def is_visible(k: str, cfg: DefaultConfigProblemBase, q: Q) -> bool:
     """Returns a flag whether a given key should be visible on UI.

     Args:
@@ -1145,7 +1165,7 @@ def get_grid_value(v: Any, type_annotation: Any) -> List[str]:


 def get_ui_elements(
-    cfg: Any,
+    cfg: DefaultConfigProblemBase,
     q: Q,
     limit: Optional[List[str]] = None,
     pre: str = "experiment/start",
@@ -1349,7 +1369,7 @@


 def parse_ui_elements(
-    cfg: Any, q: Q, limit: Union[List, str] = "", pre: str = ""
+    cfg: DefaultConfigProblemBase, q: Q, limit: Union[List, str] = "", pre: str = ""
 ) -> Any:
     """Sets configuration settings with arguments from app

@@ -1891,11 +1911,13 @@ def set_grid_to_cfg(cfg: Any, grid: Dict[str, List]) -> Any:
     return cfg


-def start_experiment(cfg: Any, q: Q, pre: str, gpu_list: Optional[List] = None) -> None:
+def start_experiment(
+    cfg: DefaultConfigProblemBase, q: Q, pre: str, gpu_list: Optional[List] = None
+) -> None:
     """Starts an experiment

     Args:
-        cfg: configuration settings
+        cfg: DefaultConfigProblemBase configuration settings
         q: Q
         pre: prefix for client keys
         gpu_list: list of GPUs available
@@ -2022,7 +2044,7 @@ def dir_file_table(current_path: str) -> pd.DataFrame:
     return pd.DataFrame({current_path: results})


-def get_download_link(q, artifact_path):
+def get_download_link(q: Q, artifact_path):
     new_path = os.path.relpath(artifact_path, get_output_dir(q))
     new_path = os.path.join(get_download_dir(q), new_path)
     url_path = os.path.relpath(new_path, get_output_dir(q))
@@ -2148,17 +2170,17 @@ def remove_temp_files(q: Q):
             os.remove(file)


-def get_gpu_usage():
-    usage = 0.0
-    all_gpus = GPUtil.getGPUs()
+def get_gpu_usage() -> float:
+    usage: float = 0.0
+    all_gpus: List[GPUtil.GPU] = GPUtil.getGPUs()
     for gpu in all_gpus:
-        usage += gpu.load
+        usage += float(gpu.load)

     usage /= len(all_gpus)
-    return usage * 100
+    return usage * 100.0


-def get_single_gpu_usage(sig_figs=1, highlight=None):
+def get_single_gpu_usage(sig_figs: int = 1, highlight: Optional[str] = None):
     all_gpus = GPUtil.getGPUs()
     items = []
     for i, gpu in enumerate(all_gpus):
@@ -2184,11 +2206,11 @@ def get_single_gpu_usage(sig_figs=1, highlight=None):
     return items


-def copy_config(cfg: Any, q: Q) -> Any:
+def copy_config(cfg: DefaultConfigProblemBase, q: Q) -> Any:
     """Makes a copy of the config

     Args:
-        cfg: config object
+        cfg: DefaultConfigProblemBase config object
     Returns:
         copy of the config
     """
@@ -2217,7 +2239,7 @@ def make_label(title: str, appendix: str = "") -> str:
     return label


-def get_cfg_list_items(cfg) -> List:
+def get_cfg_list_items(cfg: DefaultConfigProblemBase) -> List:
     items = parse_cfg_dataclass(cfg)
     x = []
     for item in items:
1 change: 1 addition & 0 deletions llm_studio/python_configs/text_causal_language_modeling_config.py

@@ -40,6 +40,7 @@ class ConfigNLPCausalLMDataset(DefaultConfig):

     system_column: str = "system"
     prompt_column: Tuple[str, ...] = ("instruction", "input")
+    prompt_column_separator: str = "\n\n"
     answer_column: str = "output"
     parent_id_column: str = "parent_id"

2 changes: 1 addition & 1 deletion llm_studio/src/augmentations/nlp_aug.py

@@ -45,7 +45,7 @@ def forward(self, batch: Dict) -> Dict:
                 .bool()
                 # & special_mask
             ).bool()
-            input_ids[mask] = self.cfg._tokenizer_mask_token_id
+            input_ids[mask] = self.cfg.tokenizer._tokenizer_mask_token_id
             batch["input_ids"] = input_ids.clone()
             batch["attention_mask"][mask] = 0
             if batch["labels"].shape[1] == batch["input_ids"].shape[1]:
2 changes: 1 addition & 1 deletion llm_studio/src/datasets/conversation_chain_handler.py

@@ -57,7 +57,7 @@ def __init__(
         # Do not set self.cfg = cfg, as ConversationChainHandler
         # will be used with PatchedAttribute context manager.
         self.conversation_chain_ids = self.get_conversation_chain_ids(cfg, df)
-        self.prompts = get_texts(df, cfg, separator="")
+        self.prompts = get_texts(df, cfg)
         self.answers = self.get_answers(df, cfg)
         self.systems = self.get_systems(cfg, df)

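The explicit `separator=""` argument disappears because `get_texts` now pulls the separator from the config itself. A rough sketch of the plausible post-change behavior (hypothetical stand-in classes and function name; not the repository's exact implementation):

```python
import pandas as pd

class _DatasetCfg:
    prompt_column = ("instruction", "input")
    prompt_column_separator = "\n\n"

class _Cfg:
    dataset = _DatasetCfg()

def get_texts_sketch(df: pd.DataFrame, cfg) -> list:
    columns = list(cfg.dataset.prompt_column)
    if len(columns) == 1:
        # Single prompt column: the separator is ignored.
        return df[columns[0]].astype(str).tolist()
    # Multiple prompt columns: concatenate row-wise with the separator.
    return [
        cfg.dataset.prompt_column_separator.join(row)
        for row in df[columns].astype(str).itertuples(index=False)
    ]

df = pd.DataFrame({"instruction": ["Translate to German:"], "input": ["Hello"]})
print(get_texts_sketch(df, _Cfg())[0])  # "Translate to German:\n\nHello"
```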
6 changes: 3 additions & 3 deletions llm_studio/src/datasets/text_causal_language_modeling_ds.py

@@ -110,7 +110,7 @@ def parse_prompt(cfg: Any, prompt: str):
             f"{codecs.decode(cfg.dataset.text_prompt_start, 'unicode_escape')}{prompt}"
         )
         if cfg.dataset.add_eos_token_to_prompt:
-            prompt += cfg._tokenizer_eos_token
+            prompt += cfg.tokenizer._tokenizer_eos_token
         prompt = (
             f"{prompt}"
             f"{codecs.decode(cfg.dataset.text_answer_separator, 'unicode_escape')}"
@@ -120,7 +120,7 @@
     @staticmethod
     def parse_answer(cfg: Any, answer: str):
         if cfg.dataset.add_eos_token_to_answer:
-            answer += cfg._tokenizer_eos_token
+            answer += cfg.tokenizer._tokenizer_eos_token
         return answer

     @staticmethod
@@ -132,7 +132,7 @@ def parse_system(cfg: Any, system: str):
             f"{codecs.decode(cfg.dataset.text_system_start, 'unicode_escape')}{system}"
         )
         if cfg.dataset.add_eos_token_to_system:
-            system += cfg._tokenizer_eos_token
+            system += cfg.tokenizer._tokenizer_eos_token
         return system

     @staticmethod
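For intuition, a sketch of how a full prompt is assembled under these settings (token strings and setting values are illustrative; the real EOS token comes from the loaded tokenizer as `cfg.tokenizer._tokenizer_eos_token`):

```python
import codecs

# Illustrative values mirroring the cfg.dataset fields used in parse_prompt.
text_prompt_start = "<|prompt|>"
text_answer_separator = "<|answer|>"
add_eos_token_to_prompt = True
eos_token = "</s>"

prompt = "What is a prompt column separator?"
prompt = f"{codecs.decode(text_prompt_start, 'unicode_escape')}{prompt}"
if add_eos_token_to_prompt:
    prompt += eos_token
prompt = f"{prompt}{codecs.decode(text_answer_separator, 'unicode_escape')}"
print(prompt)  # <|prompt|>What is a prompt column separator?</s><|answer|>
```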