def extract_tool_calls(response):
    """Extract the tool call names from the new response format"""
    tool_calls = []
    if hasattr(response, 'output') and response.output:
        for output_item in response.output:
            if output_item.type == 'function_call':
                tool_calls.append(output_item.name)
    return tool_calls
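
# The precision/recall helpers used in the loop below are assumed to compare the
# called tool names against the expected ones. If they are not defined elsewhere,
# a minimal sketch (treating both lists as sets of names) could look like this:
def calculate_precision(actual_tools, expected_tools):
    # Fraction of tools the model called that were actually expected.
    if not actual_tools:
        # Convention: an empty call list is only "perfect" if nothing was expected.
        return 1.0 if not expected_tools else 0.0
    return len(set(actual_tools) & set(expected_tools)) / len(set(actual_tools))

def calculate_recall(actual_tools, expected_tools):
    # Fraction of expected tools that the model actually called.
    if not expected_tools:
        return 1.0
    return len(set(expected_tools) & set(actual_tools)) / len(set(expected_tools))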
# Initialize a new experiment
evaluation = langwatch.evaluation.init("tool-calling-evaluation")
# Create a DataFrame from the test data for easier processing
test_df = pd.DataFrame([
    {
        "query": test_item[0],
        "expected": [tool.__name__ for tool in test_item[1]]
    }
    for test_item in tests
])
# Wrap your loop with evaluation.loop(), and iterate as usual
results = []
for idx, row in evaluation.loop(test_df.iterrows()):
    # Run your model
    result = await process_user_query(row["query"])
    # Extract tool calls
    actual_tools = extract_tool_calls(result["response"])
    # Calculate metrics
    precision = calculate_precision(actual_tools, row["expected"])
    recall = calculate_recall(actual_tools, row["expected"])
    # Log metrics for this sample
    evaluation.log("precision", index=idx, score=precision)
    evaluation.log("recall", index=idx, score=recall)
    # Include additional data for debugging
    evaluation.log("tool_selection",
                   index=idx,
                   score=recall,  # Using recall as the primary score
                   data={
                       "query": row["query"],
                       "expected_tools": row["expected"],
                       "actual_tools": actual_tools,
                       "response_time": round(result["time"], 2)
                   })
    # Store results for local analysis
    results.append({
        "query": row["query"],
        "expected": row["expected"],
        "actual": actual_tools,
        "time": round(result["time"], 2),
        "precision": precision,
        "recall": recall
    })
# Create DataFrame for local analysis
df = pd.DataFrame(results)
df
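
# Optional local summary (a sketch, assuming the columns built above): average
# precision/recall across the test set, plus the slowest queries for debugging.
print(df[["precision", "recall"]].mean())
print(df.sort_values("time", ascending=False)[["query", "time"]].head(3))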