Codelab 1 of 2 from day one. The notebook is on Kaggle (link in the docstring below).
"""
Evaluation and Structured Output
Google Gen AI 5-Day Intensive Course
Host: Kaggle
Day: 1
Kaggle: https://www.kaggle.com/code/markishere/day-1-evaluation-and-structured-output
"""
import enum
import os
from google import genai
from google.api_core import retry
from google.genai import types
from IPython.display import Markdown, display
client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])
# Automated retry: re-issue requests that fail with transient API errors (HTTP 429 or 503).
is_retriable = lambda e: (
    isinstance(e, genai.errors.APIError) and e.code in {429, 503}
)
genai.models.Models.generate_content = retry.Retry(predicate=is_retriable)(
    genai.models.Models.generate_content
)
# Alternative: guard against wrapping generate_content twice if this script is re-run:
# if not hasattr(genai.models.Models.generate_content, '__wrapped__'):
#     genai.models.Models.generate_content = retry.Retry(
#         predicate=is_retriable)(genai.models.Models.generate_content)
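# Optional variation (my own, not from the codelab): google.api_core.retry.Retry also
# accepts backoff settings, so you can tune how persistently transient failures are
# retried. Parameter names follow google-api-core; adjust for your installed version.
# tuned_retry = retry.Retry(
#     predicate=is_retriable,
#     initial=1.0,      # first delay, in seconds
#     maximum=30.0,     # cap on the delay between attempts
#     multiplier=2.0,   # exponential backoff factor
#     timeout=120.0,    # give up after this many seconds overall
# )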
# Evaluation
# Understand model performance
# Get the file locally first
# !wget -nv -O gemini.pdf https://storage.googleapis.com/cloud-samples-data/generative-ai/pdf/2403.05530.pdf
document_file = client.files.upload(file="gemini.pdf")
print("\n")
print(document_file)
print("\n")
print("\nSummarize a document\n")
# Summarize a document
def summarize_doc(request: str) -> str:
    """Execute the request on the uploaded document."""
    # Set the temperature low to stabilize the output.
    config = types.GenerateContentConfig(temperature=0.0)
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        config=config,
        contents=[request, document_file],
    )
    return response.text
request = "Tell me about the training process used here."
summary = summarize_doc(request)
display(Markdown(summary + "\n-----"))
print("\n\n")
# Define an evaluator
SUMMARY_PROMPT = """\
# Instruction
You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models.
We will provide you with the user input and an AI-generated response.
You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.
You will assign the response a rating following the Rating Rubric and Evaluation Steps. Give step-by-step explanations for your rating, and only choose ratings from the Rating Rubric.
# Evaluation
## Metric Definition
You will be assessing summarization quality, which measures the overall ability to summarize text. Pay special attention to length constraints, such as in X words or in Y sentences. The instruction for performing a summarization task and the context to be summarized are provided in the user prompt. The response should be shorter than the text in the context. The response should not contain information that is not present in the context.
## Criteria
Instruction following: The response demonstrates a clear understanding of the summarization task instructions, satisfying all of the instruction's requirements.
Groundedness: The response contains information included only in the context. The response does not reference any outside information.
Conciseness: The response summarizes the relevant details in the original text without a significant loss in key information without being too verbose or terse.
Fluency: The response is well-organized and easy to read.
## Rating Rubric
5: (Very good). The summary follows instructions, is grounded, is concise, and fluent.
4: (Good). The summary follows instructions, is grounded, concise, and fluent.
3: (Ok). The summary mostly follows instructions, is grounded, but is not very concise and is not fluent.
2: (Bad). The summary is grounded, but does not follow the instructions.
1: (Very bad). The summary is not grounded.
## Evaluation Steps
STEP 1: Assess the response in aspects of instruction following, groundedness, conciseness, and verbosity according to the criteria.
STEP 2: Score based on the rubric.
# User Inputs and AI-generated Response
## User Inputs
### Prompt
{prompt}
## AI-generated Response
{response}
"""
# Define a structured enum class to capture the result.
class SummaryRating(enum.Enum):
    VERY_GOOD = 5
    GOOD = 4
    OK = 3
    BAD = 2
    VERY_BAD = 1
def eval_summary(prompt, ai_response):
    """Evaluate the generated summary against the prompt."""
    chat = client.chats.create(model="gemini-2.0-flash")
    # Generate the full text response
    response = chat.send_message(
        message=SUMMARY_PROMPT.format(prompt=prompt, response=ai_response)
    )
    verbose_eval = response.text
    # Coerce into desired structure
    structured_output_config = types.GenerateContentConfig(
        response_mime_type="text/x.enum",
        response_schema=SummaryRating,
    )
    response = chat.send_message(
        message="Convert the final score.",
        config=structured_output_config,
    )
    structured_eval = response.parsed
    return verbose_eval, structured_eval
text_eval, struct_eval = eval_summary(
    prompt=[request, document_file],
    ai_response=summary,
)
display(Markdown(text_eval))
print(struct_eval)
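# A possible variation (my own sketch, not the codelab's approach): request the rating
# and the explanation in one call by passing a Pydantic model as the response schema
# with response_mime_type="application/json". The class and field names are mine.
from pydantic import BaseModel

class SummaryEvaluation(BaseModel):
    rating: int  # intended to be 1-5, mirroring SummaryRating
    explanation: str

def eval_summary_structured(prompt, ai_response):
    """Single-call evaluation that returns a parsed SummaryEvaluation."""
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        config=types.GenerateContentConfig(
            response_mime_type="application/json",
            response_schema=SummaryEvaluation,
        ),
        contents=SUMMARY_PROMPT.format(prompt=prompt, response=ai_response),
    )
    return response.parsed

structured = eval_summary_structured([request, document_file], summary)
print(structured)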
# Play with the summary prompt
new_prompt = "Explain like I'm 5 the training process"
# Try:
# ELI5 the training process
# Summarise the needle/haystack evaluation technique in 1 line
# Describe the model architecture to someone with a civil engineering degree
# What is the best LLM?
if not new_prompt:
    raise ValueError("Try setting a new summarization prompt.")
def run_and_eval_summary(prompt):
    """Generate and evaluate a summary for the given prompt."""
    summary = summarize_doc(prompt)
    display(Markdown(summary + "\n-----"))
    text, struct = eval_summary([prompt, document_file], summary)
    display(Markdown(text + "\n-----"))
    print(struct)
run_and_eval_summary(new_prompt)
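# Hedged extension (not in the codelab): loop over the suggested prompts above and
# collect the structured ratings to compare how prompt phrasing shifts the score.
candidate_prompts = [
    "ELI5 the training process",
    "Summarise the needle/haystack evaluation technique in 1 line",
    "Describe the model architecture to someone with a civil engineering degree",
    "What is the best LLM?",
]
for candidate in candidate_prompts:
    candidate_summary = summarize_doc(candidate)
    _, rating = eval_summary([candidate, document_file], candidate_summary)
    print(f"{candidate!r}: {rating}")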