Vietnamese-Function-Calling-Test
Vietnamese Function Calling Benchmark
数据集详情
- 数据大小: 2899个单轮函数调用样本
- 领域:
  - 银行
  - 保险
  - 旅行
  - 教育
  - 健康
  - 招聘
  - 车辆控制
  - 购物
  - 工作
  - 汽车服务
- 函数数量: 159个函数
模型评估
| 模型名称 | 模型大小 | 函数名称准确率 (%) | 完全匹配准确率 (%) |
|---|---|---|---|
| phamhai/Llama-3.2-3B-Instruct-Frog | ~3B | 95.79 | 51.05 |
| Gemini-1.5-Pro | --- | 96.96 | 55.16 |
| Gemini-1.5-Flash | --- | 97.10 | 51.64 |
| Gemini-1.5-Flash-8B | --- | 97.38 | 64.75 |
| gpt-4o-2024-08-06 | --- | 94.38 | 52.88 |
| arcee-ai/Arcee-VyLinh | ~3B | --- | --- |
| phamhai/Llama-3.2-3B-Instruct-Frog-Pro | ~3B | 98.12 | 56.38 |
评估代码
加载模型和数据集
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint evaluated by the Frog inference snippet.
model_path = "phamhai/Llama-3.2-3B-Instruct-Frog"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    # NOTE(review): force_download=True re-fetches the weights on every run;
    # kept as in the original snippet -- drop it if the local cache is trusted.
    force_download=True,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# The benchmark itself; the evaluation loops iterate over its "test" split.
dataset = load_dataset("phamhai/Vietnamese-Function-Calling-Test")
Frog模型推理代码
from tqdm import tqdm


def infer(text, tools, max_new_tokens=128):
    """Generate a function call for ``text`` with the locally loaded model.

    Args:
        text: The user query, sent as the ``user`` chat message.
        tools: Iterable of tool-description strings; they are joined with
            ``", "`` and appended to the Vietnamese system prompt.
        max_new_tokens: Generation budget passed to ``model.generate``
            (defaults to the original hard-coded 128).

    Returns:
        The decoded output after the last ``"<functioncall> "`` marker, with
        every ``"<|eot_id|>"`` removed. If the marker is absent the whole
        decoded sequence (prompt included) is returned, because
        ``str.split`` then yields a single element.

    Relies on the module-level ``tokenizer`` and ``model``.
    """
    messages = [
        {
            "role": "system",
            "content": """Bạn là một trợ lý hữu ích với khả năng truy cập vào các hàm sau. Hãy chọn một trong các công cụ được cung cấp dưới đây để sử dụng cho việc trả lời câu hỏi của người dùng - %s""" % ", ".join(tools),
        },
        {"role": "user", "content": text},
    ]
    tokenized_chat = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    # Follow the model's own device instead of assuming "cuda:0": the model is
    # loaded with device_map="auto", so the target device is decided at load time.
    tokenized_chat = tokenized_chat.to(model.device)
    outputs = model.generate(tokenized_chat, max_new_tokens=max_new_tokens)
    decoded = tokenizer.decode(outputs[0])
    return decoded.split("<functioncall> ")[-1].replace("<|eot_id|>", "")
# Collect model predictions and reference outputs for the whole test split.
preds = []
golds = []

for d in tqdm(dataset["test"]):
    golds.append(d["output"])
    preds.append(infer(d["input_text"], d["tools"]))
Gemini-1.5-Pro推理代码
import os
from collections.abc import Iterable

import google.generativeai as genai
from google.generativeai.types import content_types
from tqdm import tqdm


def tool_config_from_mode(mode: str, fns: Iterable[str] = ()):
    """Build a Gemini tool config for the given function-calling mode.

    Args:
        mode: Value stored under ``function_calling_config.mode``.
        fns: Accepted for interface compatibility; the body does not use it.

    Returns:
        Whatever ``content_types.to_tool_config`` produces for the config dict.
    """
    return content_types.to_tool_config(
        {"function_calling_config": {"mode": mode}}
    )


# Shared tool config used by every Gemini request below.
tool_config = tool_config_from_mode("any")

# Read the key from the environment rather than pasting it into the source;
# falls back to the original empty-string placeholder when the variable is unset.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))
def infer_gemini_with_tools(text, tools, model_name="gemini-1.5-pro", max_retries=10):
    """Ask a Gemini model to pick a tool for ``text`` and return the call.

    Args:
        text: The user query sent as the prompt.
        tools: Iterable of strings, each a Python-literal function declaration.
        model_name: Gemini model identifier (defaults to the original
            hard-coded ``"gemini-1.5-pro"``).
        max_retries: Number of retries after the first attempt; the default
            of 10 reproduces the original limit of 11 attempts in total.

    Returns:
        ``{"name": ..., "arguments": {...}}`` when the first response part
        holds a function call; ``{"name": <part text>, "arguments": ""}``
        otherwise. If every attempt raises, the last response object received
        is returned, or ``None`` when no request ever succeeded.

    Relies on the module-level ``genai`` and ``tool_config``.
    """
    model = genai.GenerativeModel(model_name)

    prepare_tools_for_gem = []
    for tool in tools:
        # NOTE(review): eval() executes arbitrary code; only safe while the
        # tool strings come from a trusted dataset.
        tool = eval(tool)
        # Declarations with no properties have their "parameters" entry removed
        # (presumably the API rejects an empty schema -- TODO confirm).
        if len(tool["parameters"]["properties"]) == 0:
            tool.pop("parameters", None)
        prepare_tools_for_gem.append(tool)

    # Bound up front so the final return cannot hit an unassigned name when
    # every attempt fails before a response is received.
    response = None
    for _ in range(max_retries + 1):
        try:
            response = model.generate_content(
                text,
                tools=[{"function_declarations": prepare_tools_for_gem}],
                generation_config=genai.GenerationConfig(
                    max_output_tokens=1000,
                    temperature=0.1,
                ),
                tool_config=tool_config,
            )
            part = response.candidates[0].content.parts[0]
            if "function_call" in part:
                return {
                    "name": part.function_call.name,
                    "arguments": dict(part.function_call.args),
                }
            return {
                "name": part.text,
                "arguments": "",
            }
        except Exception as e:
            # Best-effort: report the failure and try again.
            print(e)
    return response
# Collect Gemini predictions and reference outputs for the whole test split.
preds = []
golds = []

for d in tqdm(dataset["test"]):
    golds.append(d["output"])
    preds.append(infer_gemini_with_tools(d["input_text"], d["tools"]))
OpenAI's GPT-4o推理代码
import json
import os

import openai
from openai import OpenAI

# Read the key from the environment rather than pasting it into the source;
# falls back to the original empty-string placeholder when the variable is unset.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))
def infer_gpt4o_with_tools(text, tools, model="gpt-4o", max_retries=3):
    """Ask an OpenAI chat model to pick a tool for ``text`` and return the call.

    Args:
        text: The user query, sent as the only chat message.
        tools: Iterable of strings, each a Python-literal function declaration.
        model: Chat model identifier (defaults to the original hard-coded
            ``"gpt-4o"``).
        max_retries: Number of retries after the first attempt; the default
            of 3 reproduces the original limit of 4 attempts in total.

    Returns:
        ``{"name": ..., "arguments": {...}}`` built from the first tool call,
        or ``{"name": "not using tool", "arguments": ""}`` when the reply has
        no tool call or every attempt raised.

    Relies on the module-level ``client`` and ``json``.
    """
    prepare_tools_for_gpt = []
    for tool in tools:
        # NOTE(review): eval() executes arbitrary code; only safe while the
        # tool strings come from a trusted dataset.
        tool = eval(tool)
        # Declarations with no properties have their "parameters" entry removed.
        if len(tool["parameters"]["properties"]) == 0:
            tool.pop("parameters", None)
        prepare_tools_for_gpt.append({
            "type": "function",
            "function": tool,
        })

    messages = [{"role": "user", "content": text}]

    for _ in range(max_retries + 1):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                tools=prepare_tools_for_gpt,
                temperature=0,
                tool_choice="required",
            )
            response_message = response.choices[0].message
            if response_message.tool_calls:
                tool_call = response_message.tool_calls[0]
                return {
                    "name": tool_call.function.name,
                    "arguments": dict(json.loads(tool_call.function.arguments)),
                }
            return {
                "name": "not using tool",
                "arguments": "",
            }
        except Exception as e:
            # Best-effort: report the failure and try again.
            print(e)

    # Retry budget exhausted: same placeholder as the "no tool call" case.
    return {
        "name": "not using tool",
        "arguments": "",
    }
# Collect GPT-4o predictions and reference outputs for the whole test split.
preds = []
golds = []

for d in tqdm(dataset["test"]):
    golds.append(d["output"])
    preds.append(infer_gpt4o_with_tools(d["input_text"], d["tools"]))
获取准确率代码
import json


def compute_accuracy(preds, golds):
    """Score predicted function calls against the reference calls.

    Args:
        preds: Predictions, each either a string holding a Python-literal
            dict or an already-parsed dict. A string that does not end with
            ``"}}"`` gets one ``"}"`` appended before parsing.
        golds: Reference strings; the ``"<functioncall> "`` prefix is
            stripped before parsing.

    Returns:
        ``(name_accuracy, full_accuracy)``: the fraction of predictions whose
        ``"name"`` matches the reference, and the fraction equal to the
        reference in full. Both are ``0.0`` for an empty ``preds``. Any sample
        that cannot be parsed or compared counts as incorrect.
    """
    total = len(preds)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0, 0.0

    correct_fc_name = 0
    correct_full_fc = 0
    for pred, gold in zip(preds, golds):
        try:
            if isinstance(pred, str):
                if not pred.endswith("}}"):
                    pred = pred + "}"
                # NOTE(review): eval() executes arbitrary code from model
                # output; run only on results you trust.
                p = eval(pred)
            else:
                # Already structured (e.g. a dict loaded from JSON): use as is
                # instead of failing inside eval().
                p = pred
            g = eval(gold.replace("<functioncall> ", ""))
            if p["name"] == g["name"]:
                correct_fc_name += 1
            if p == g:
                correct_full_fc += 1
        except Exception:
            # Malformed prediction or reference: scored as a miss.
            continue

    return correct_fc_name / total, correct_full_fc / total


if __name__ == "__main__":
    # The results file must hold a two-element JSON array: [preds, golds].
    with open("./test_results.json", "r", encoding="utf-8") as f_r:
        preds, golds = json.load(f_r)

    name_accuracy, full_accuracy = compute_accuracy(preds, golds)

    print("Accuracy in classifying into the correct function name: ", name_accuracy)
    print("Accuracy in classifying into the correct function and all associated parameters: ", full_accuracy)
联系作者
- 邮箱: phamhuuhai1402@gmail.com




