From b9e83c8758fa4a332f76ac5c72eaff31076073d0 Mon Sep 17 00:00:00 2001 From: Twinkll Sisodia Date: Mon, 13 Nov 2023 09:48:36 -0500 Subject: [PATCH 1/7] Add prometheus instrumentation --- .../ui/gradio/gradio-hftgi-rag-redis/app.py | 53 ++++++++++++------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/examples/ui/gradio/gradio-hftgi-rag-redis/app.py b/examples/ui/gradio/gradio-hftgi-rag-redis/app.py index fda5bb5..8d8d766 100644 --- a/examples/ui/gradio/gradio-hftgi-rag-redis/app.py +++ b/examples/ui/gradio/gradio-hftgi-rag-redis/app.py @@ -15,7 +15,7 @@ from langchain.llms import HuggingFaceTextGenInference from langchain.prompts import PromptTemplate from langchain.vectorstores.redis import Redis -from prometheus_client import start_http_server, Counter +from prometheus_client import start_http_server, Counter, Histogram, Gauge load_dotenv() @@ -24,7 +24,7 @@ APP_TITLE = os.getenv('APP_TITLE', 'Talk with your documentation') INFERENCE_SERVER_URL = os.getenv('INFERENCE_SERVER_URL') -MAX_NEW_TOKENS = int(os.getenv('MAX_NEW_TOKENS', 512)) +MAX_NEW_TOKENS = int(os.getenv('MAX_NEW_TOKENS', 100)) TOP_K = int(os.getenv('TOP_K', 10)) TOP_P = float(os.getenv('TOP_P', 0.95)) TYPICAL_P = float(os.getenv('TYPICAL_P', 0.95)) @@ -33,16 +33,20 @@ REDIS_URL = os.getenv('REDIS_URL') REDIS_INDEX = os.getenv('REDIS_INDEX') - +TIMEOUT = int(os.getenv('TIMEOUT', 30)) # Start Prometheus metrics server start_http_server(8000) -# Create a counter metric -FEEDBACK_COUNTER = Counter("feedback_stars", "Number of feedbacks by stars", ["stars"]) +# Create metric +FEEDBACK_COUNTER = Counter("feedback_stars", "Number of feedbacks by stars", ["stars", "model_id"]) MODEL_USAGE_COUNTER = Counter('model_usage', 'Number of times a model was used', ['model_id']) +REQUEST_TIME = Gauge('request_duration_seconds', 'Time spent processing a request', ['model_id']) +SATISFACTION = Gauge('satisfaction_rating', 'User satisfaction rating', ['rating']) +TIMEOUTS = Counter('timeouts_total', 'Total number of request timeouts', ['model_id']) + model_id = "" -client = Client(base_url=INFERENCE_SERVER_URL) +client = Client(base_url=INFERENCE_SERVER_URL,timeout=TIMEOUT) # Streaming implementation class QueueCallback(BaseCallbackHandler): @@ -67,20 +71,28 @@ def remove_source_duplicates(input_list): def stream(input_text) -> Generator: global model_id - # Create a Queue job_done = object() # Create a function to call - this will run in a thread def task(): resp = qa_chain({"query": input_text}) - sources = remove_source_duplicates(resp['source_documents']) - + sources = remove_source_duplicates(resp['source_documents']) input = str(input_text) - response = client.generate(input, max_new_tokens=MAX_NEW_TOKENS) - text = response.generated_text - model_id = response.model_id + start_time = time.perf_counter() # start and end time to get the precise timing of the request + + try: + response = client.generate(input, max_new_tokens=MAX_NEW_TOKENS) + end_time = time.perf_counter() + model_id = response.model_id + # Record successful request time + REQUEST_TIME.labels(model_id=model_id).set(end_time - start_time) + except TimeoutError: # or whatever exception your client throws on timeout + end_time = time.perf_counter() + TIMEOUTS.info({'model_id': model_id, 'timeout_duration': str(end_time - start_time), 'input_text': input}) + q.put({"model_id": response.model_id}) + q.put({"generated_text": response.generated_text}) print("MODEL ID IS:",model_id) print("Question:",input) if len(sources) != 0: @@ -88,6 +100,7 @@ def task(): for source in sources: q.put("* " + str(source) + "\n") q.put(job_done) + print("Saving it...") # Create a thread and start the function t = Thread(target=task) @@ -104,9 +117,11 @@ def task(): if isinstance(next_token, dict) and 'model_id' in next_token: model_id = next_token['model_id'] MODEL_USAGE_COUNTER.labels(model_id=model_id).inc() + if isinstance(next_token, dict) and 'generated_text' in next_token: + generated_text = next_token['generated_text'] elif isinstance(next_token, str): content += next_token - yield next_token, content, model_id + yield next_token, generated_text, model_id except Empty: continue @@ -163,18 +178,19 @@ def task(): return_source_documents=True ) -# Gradio implementation def ask_llm(message, history): - for next_token, content, model_id in stream(message): + for next_token, generated_text, model_id in stream(message): print(model_id) - yield f"{content}\n\nModel ID: {model_id}" + model_id_box.update(value=model_id) + yield f"{generated_text}\n\nModel ID: {model_id}" +# Gradio implementation with gr.Blocks(title="HatBot", css="footer {visibility: hidden}") as demo: input_box = gr.Textbox(label="Your Question") output_answer = gr.Textbox(label="Answer", readonly=True) - + model_id_box = gr.Textbox(visible=False) # will hold the model_id gr.Interface( fn=ask_llm, @@ -194,7 +210,8 @@ def ask_llm(message, history): def get_feedback(star): print("Rating: " + star) # Increment the counter based on the star rating received - FEEDBACK_COUNTER.labels(stars=str(star)).inc() + FEEDBACK_COUNTER.labels(stars=str(star), model_id=model_id).inc() + SATISFACTION.labels(rating=star).set(1) return f"Received {star} star feedback. Thank you!" From ee685ec65f29b2f14df8945e8ea941664bf89d64 Mon Sep 17 00:00:00 2001 From: Twinkll Sisodia Date: Mon, 13 Nov 2023 10:19:09 -0500 Subject: [PATCH 2/7] Modify the streaming calls to get model_id and source_docs using langchain --- .../ui/gradio/gradio-hftgi-rag-redis/app.py | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/examples/ui/gradio/gradio-hftgi-rag-redis/app.py b/examples/ui/gradio/gradio-hftgi-rag-redis/app.py index 8d8d766..544aeac 100644 --- a/examples/ui/gradio/gradio-hftgi-rag-redis/app.py +++ b/examples/ui/gradio/gradio-hftgi-rag-redis/app.py @@ -16,6 +16,7 @@ from langchain.prompts import PromptTemplate from langchain.vectorstores.redis import Redis from prometheus_client import start_http_server, Counter, Histogram, Gauge +import requests load_dotenv() @@ -68,6 +69,15 @@ def remove_source_duplicates(input_list): unique_list.append(item.metadata['source']) return unique_list +def get_model_info(): + response = requests.get(INFERENCE_SERVER_URL + "/info") + json_response = response.json() + print(json_response) + model_id = json_response['model_id'] # Extract the model_id + print("Model ID:", model_id) # Print the model_id + return model_id # Return the model_id instead of the whole JSON + + def stream(input_text) -> Generator: global model_id @@ -82,17 +92,17 @@ def task(): start_time = time.perf_counter() # start and end time to get the precise timing of the request try: - response = client.generate(input, max_new_tokens=MAX_NEW_TOKENS) + # response = client.generate(input, max_new_tokens=MAX_NEW_TOKENS) + model_id = get_model_info() end_time = time.perf_counter() - model_id = response.model_id # Record successful request time REQUEST_TIME.labels(model_id=model_id).set(end_time - start_time) except TimeoutError: # or whatever exception your client throws on timeout end_time = time.perf_counter() TIMEOUTS.info({'model_id': model_id, 'timeout_duration': str(end_time - start_time), 'input_text': input}) - q.put({"model_id": response.model_id}) - q.put({"generated_text": response.generated_text}) + q.put({"model_id": model_id}) + # q.put({"generated_text": resp.generated_text}) print("MODEL ID IS:",model_id) print("Question:",input) if len(sources) != 0: @@ -117,11 +127,11 @@ def task(): if isinstance(next_token, dict) and 'model_id' in next_token: model_id = next_token['model_id'] MODEL_USAGE_COUNTER.labels(model_id=model_id).inc() - if isinstance(next_token, dict) and 'generated_text' in next_token: - generated_text = next_token['generated_text'] + # if isinstance(next_token, dict) and 'generated_text' in next_token: + # generated_text = next_token['generated_text'] elif isinstance(next_token, str): content += next_token - yield next_token, generated_text, model_id + yield next_token, content, model_id except Empty: continue @@ -179,10 +189,10 @@ def task(): ) def ask_llm(message, history): - for next_token, generated_text, model_id in stream(message): + for next_token, content, model_id in stream(message): print(model_id) model_id_box.update(value=model_id) - yield f"{generated_text}\n\nModel ID: {model_id}" + yield f"{content}\n\nModel ID: {model_id}" # Gradio implementation From 927bc5cb2d968a17c0775553b785e0b5c34fe432 Mon Sep 17 00:00:00 2001 From: Twinkll Sisodia Date: Tue, 21 Nov 2023 16:08:30 -0500 Subject: [PATCH 3/7] Code refinement --- .../ui/gradio/gradio-hftgi-rag-redis/app.py | 51 +++++++------------ 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/examples/ui/gradio/gradio-hftgi-rag-redis/app.py b/examples/ui/gradio/gradio-hftgi-rag-redis/app.py index 544aeac..2568e3a 100644 --- a/examples/ui/gradio/gradio-hftgi-rag-redis/app.py +++ b/examples/ui/gradio/gradio-hftgi-rag-redis/app.py @@ -42,8 +42,6 @@ FEEDBACK_COUNTER = Counter("feedback_stars", "Number of feedbacks by stars", ["stars", "model_id"]) MODEL_USAGE_COUNTER = Counter('model_usage', 'Number of times a model was used', ['model_id']) REQUEST_TIME = Gauge('request_duration_seconds', 'Time spent processing a request', ['model_id']) -SATISFACTION = Gauge('satisfaction_rating', 'User satisfaction rating', ['rating']) -TIMEOUTS = Counter('timeouts_total', 'Total number of request timeouts', ['model_id']) model_id = "" @@ -69,22 +67,13 @@ def remove_source_duplicates(input_list): unique_list.append(item.metadata['source']) return unique_list -def get_model_info(): - response = requests.get(INFERENCE_SERVER_URL + "/info") - json_response = response.json() - print(json_response) - model_id = json_response['model_id'] # Extract the model_id - print("Model ID:", model_id) # Print the model_id - return model_id # Return the model_id instead of the whole JSON - def stream(input_text) -> Generator: global model_id - # Create a Queue + # Create queue job_done = object() - # Create a function to call - this will run in a thread def task(): resp = qa_chain({"query": input_text}) sources = remove_source_duplicates(resp['source_documents']) @@ -92,17 +81,17 @@ def task(): start_time = time.perf_counter() # start and end time to get the precise timing of the request try: - # response = client.generate(input, max_new_tokens=MAX_NEW_TOKENS) - model_id = get_model_info() + response = requests.get(INFERENCE_SERVER_URL + "/info") + json_response = response.json() + print(json_response) + model_id = json_response['model_id'] end_time = time.perf_counter() # Record successful request time REQUEST_TIME.labels(model_id=model_id).set(end_time - start_time) except TimeoutError: # or whatever exception your client throws on timeout end_time = time.perf_counter() - TIMEOUTS.info({'model_id': model_id, 'timeout_duration': str(end_time - start_time), 'input_text': input}) q.put({"model_id": model_id}) - # q.put({"generated_text": resp.generated_text}) print("MODEL ID IS:",model_id) print("Question:",input) if len(sources) != 0: @@ -126,9 +115,7 @@ def task(): break if isinstance(next_token, dict) and 'model_id' in next_token: model_id = next_token['model_id'] - MODEL_USAGE_COUNTER.labels(model_id=model_id).inc() - # if isinstance(next_token, dict) and 'generated_text' in next_token: - # generated_text = next_token['generated_text'] + MODEL_USAGE_COUNTER.labels(model_id=model_id).inc() elif isinstance(next_token, str): content += next_token yield next_token, content, model_id @@ -196,33 +183,33 @@ def ask_llm(message, history): # Gradio implementation -with gr.Blocks(title="HatBot", css="footer {visibility: hidden}") as demo: +with gr.Blocks(title="HatBot") as demo: + + with gr.Column(class_name="column-class"): + project_box = gr.Textbox(label="Your Project") + customer_box = gr.Textbox(label="Customer") + input_box = gr.Textbox(label="Your Question") + submit_button = gr.Button("Submit") + output_answer = gr.Textbox(label="Answer", readonly=True) - input_box = gr.Textbox(label="Your Question") - output_answer = gr.Textbox(label="Answer", readonly=True) model_id_box = gr.Textbox(visible=False) # will hold the model_id - gr.Interface( + submit_button.click( fn=ask_llm, inputs=[input_box], - outputs=[output_answer], - clear_btn=None, - retry_btn=None, - undo_btn=None, - stop_btn=None, - description=APP_TITLE - ) + outputs=[output_answer] + ) radio = gr.Radio(["1", "2", "3", "4", "5"], label="Star Rating") output = gr.Textbox(label="Output Box") + download_button = gr.Button("Download") + @radio.input(inputs=radio, outputs=output) def get_feedback(star): print("Rating: " + star) # Increment the counter based on the star rating received FEEDBACK_COUNTER.labels(stars=str(star), model_id=model_id).inc() - SATISFACTION.labels(rating=star).set(1) - return f"Received {star} star feedback. Thank you!" From 3b99965d945087e52b4a4259b7d95366fe60f1fa Mon Sep 17 00:00:00 2001 From: Twinkll Sisodia Date: Tue, 21 Nov 2023 16:09:39 -0500 Subject: [PATCH 4/7] Add loop to send 10 requests and modify script according to new UI --- .../gradio-hftgi-rag-redis/chatbot_test.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 examples/ui/gradio/gradio-hftgi-rag-redis/chatbot_test.py diff --git a/examples/ui/gradio/gradio-hftgi-rag-redis/chatbot_test.py b/examples/ui/gradio/gradio-hftgi-rag-redis/chatbot_test.py new file mode 100644 index 0000000..3a374aa --- /dev/null +++ b/examples/ui/gradio/gradio-hftgi-rag-redis/chatbot_test.py @@ -0,0 +1,46 @@ +# Import necessary libraries +import time +import json +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.common.action_chains import ActionChains +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.wait import WebDriverWait +from selenium.webdriver.common.keys import Keys +from selenium.webdriver.common.desired_capabilities import DesiredCapabilities +from selenium.common.exceptions import TimeoutException +import random + +driver = webdriver.Firefox() +driver.get("https://canary-gradio-vectordb.apps.ai-dev01.kni.syseng.devcluster.openshift.com") +driver.set_window_size(1084, 811) +timeout = 10 + +for user in range(10): + element_present = EC.presence_of_element_located((By.CSS_SELECTOR, "#component-0 .scroll-hide")) + WebDriverWait(driver, timeout).until(element_present) + + # User enters a question + project_input = driver.find_element(By.CSS_SELECTOR, "#component-3 .scroll-hide") + project_input.clear() # Clearing any previous input + project_input.send_keys(f"User {user + 1}: OpenShift AI") + customer_input = driver.find_element(By.CSS_SELECTOR, "#component-4 .scroll-hide") + customer_input.clear() # Clearing any previous input + customer_input.send_keys(f"User {user + 1}: Accenture") + question_input = driver.find_element(By.CSS_SELECTOR, "#component-5 .scroll-hide") + question_input.clear() # Clearing any previous input + question_input.send_keys(f"User {user + 1}: What is OpenShift AI?") + driver.find_element(By.ID, "component-6").click() + + label_list=[1,2,3,4,5] + random_num = random.choice(label_list) + labelname=str(random_num)+'-radio-label' + label_id="label[data-testid='"+labelname+"']" + + # # Wait for and click on the feedback element + # label_id = "label[data-testid='2-radio-label']" + WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, label_id))).click() + time.sleep(2) # Adding a delay for better simulation of user interaction + +# Close the browser after the loop completes +#driver.quit() From 35a58d255f266e2e210873b8b93fff364bded070 Mon Sep 17 00:00:00 2001 From: Twinkll Sisodia Date: Wed, 22 Nov 2023 12:23:47 -0500 Subject: [PATCH 5/7] Script refinement --- .../gradio-hftgi-rag-redis/chatbot_test.py | 30 ------------------- 1 file changed, 30 deletions(-) diff --git a/examples/ui/gradio/gradio-hftgi-rag-redis/chatbot_test.py b/examples/ui/gradio/gradio-hftgi-rag-redis/chatbot_test.py index bd08933..27be4bd 100644 --- a/examples/ui/gradio/gradio-hftgi-rag-redis/chatbot_test.py +++ b/examples/ui/gradio/gradio-hftgi-rag-redis/chatbot_test.py @@ -1,13 +1,10 @@ # Import necessary libraries -# Generated by Selenium IDE -# import pytest import time import json from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.desired_capabilities import DesiredCapabilities @@ -40,36 +37,9 @@ labelname=str(random_num)+'-radio-label' label_id="label[data-testid='"+labelname+"']" - # # Wait for and click on the feedback element # label_id = "label[data-testid='2-radio-label']" WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, label_id))).click() time.sleep(2) # Adding a delay for better simulation of user interaction # Close the browser after the loop completes #driver.quit() - -# read yaml file -# loop -driver = webdriver.Firefox() -vars = {} - -driver.get("https://gradio-hftgi-rag-redis-vectordb.apps.ai-dev01.kni.syseng.devcluster.openshift.com") -driver.set_window_size(1084, 811) -timeout = 10 -try: - # element_present = EC.presence_of_element_located((By.CSS_SELECTOR, "#chatinput .scroll-hide")) - element_present = EC.presence_of_element_located((By.CSS_SELECTOR, "#component-1 .scroll-hide")) - WebDriverWait(driver, timeout).until(element_present) -except TimeoutException: - print("Timed out waiting for page to load") - -driver.find_element(By.CSS_SELECTOR, "#component-1 .scroll-hide") -driver.find_element(By.CSS_SELECTOR, "#component-1 .scroll-hide").send_keys("hi how are you") -driver.find_element(By.ID, "component-12").click() - -label_id="label[data-testid='2-radio-label']" -x = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, label_id))).click() - -# if needed add some delay -# driver.quit() -# end loop From 0e0f4ed3bf6e1cbcd6120b5ddce500d089897913 Mon Sep 17 00:00:00 2001 From: Twinkll Sisodia Date: Wed, 22 Nov 2023 12:36:41 -0500 Subject: [PATCH 6/7] Add more negative star rating in the list --- examples/ui/gradio/gradio-hftgi-rag-redis/chatbot_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ui/gradio/gradio-hftgi-rag-redis/chatbot_test.py b/examples/ui/gradio/gradio-hftgi-rag-redis/chatbot_test.py index 27be4bd..ba8cce2 100644 --- a/examples/ui/gradio/gradio-hftgi-rag-redis/chatbot_test.py +++ b/examples/ui/gradio/gradio-hftgi-rag-redis/chatbot_test.py @@ -32,7 +32,7 @@ question_input.send_keys(f"User {user + 1}: What is OpenShift AI?") driver.find_element(By.ID, "component-6").click() - label_list=[1,2,3,4,5] + label_list=[1,1,1,1,1,1,1,1,2,3,4,5] random_num = random.choice(label_list) labelname=str(random_num)+'-radio-label' label_id="label[data-testid='"+labelname+"']" From ac51717ae6619efaf085461fb8af2aac81f1c236 Mon Sep 17 00:00:00 2001 From: Twinkll Sisodia Date: Tue, 28 Nov 2023 10:38:32 -0500 Subject: [PATCH 7/7] Code refinement with correct random list --- examples/ui/gradio/gradio-hftgi-rag-redis/chatbot_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/ui/gradio/gradio-hftgi-rag-redis/chatbot_test.py b/examples/ui/gradio/gradio-hftgi-rag-redis/chatbot_test.py index ba8cce2..09029e3 100644 --- a/examples/ui/gradio/gradio-hftgi-rag-redis/chatbot_test.py +++ b/examples/ui/gradio/gradio-hftgi-rag-redis/chatbot_test.py @@ -16,7 +16,7 @@ driver.set_window_size(1084, 811) timeout = 10 -for user in range(10): +for user in range(20): element_present = EC.presence_of_element_located((By.CSS_SELECTOR, "#component-0 .scroll-hide")) WebDriverWait(driver, timeout).until(element_present) @@ -32,7 +32,7 @@ question_input.send_keys(f"User {user + 1}: What is OpenShift AI?") driver.find_element(By.ID, "component-6").click() - label_list=[1,1,1,1,1,1,1,1,2,3,4,5] + label_list=[1,2,3,4,5] random_num = random.choice(label_list) labelname=str(random_num)+'-radio-label' label_id="label[data-testid='"+labelname+"']"