From a7e8a8aebc4572b003aa369fa12794aafeebe8cc Mon Sep 17 00:00:00 2001
From: Jie JW Wu <122728498+jie-jw-wu@users.noreply.github.com>
Date: Fri, 21 Feb 2025 18:16:59 -0800
Subject: [PATCH 1/2] Create evaluate_fine_tuned_model.py

---
 fine-tuning/evaluate_fine_tuned_model.py | 61 ++++++++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 fine-tuning/evaluate_fine_tuned_model.py

diff --git a/fine-tuning/evaluate_fine_tuned_model.py b/fine-tuning/evaluate_fine_tuned_model.py
new file mode 100644
index 0000000..c6b0dd6
--- /dev/null
+++ b/fine-tuning/evaluate_fine_tuned_model.py
@@ -0,0 +1,61 @@
+import argparse
+import os
+import torch
+from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer, DataCollatorForSeq2Seq
+from datasets import load_dataset
+
+def evaluate_model(model_path, dataset_path, output_dir):
+    # Load the model and tokenizer
+    model = AutoModelForCausalLM.from_pretrained(model_path)
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+    # Load the dataset
+    data = load_dataset('json', data_files=dataset_path)
+
+    # Tokenize the dataset
+    def tokenize_function(samples):
+        concatenated_text = samples['problem'] + samples['answer']
+        result = tokenizer(
+            concatenated_text,
+            truncation=True,
+            max_length=512,
+            padding=False,
+            return_tensors=None,
+        )
+        result["labels"] = result["input_ids"].copy()
+        return result
+
+    tokenized_data = data.map(tokenize_function)
+    val_dataset = tokenized_data['train'].train_test_split(test_size=0.2, seed=42)['test']
+
+    # Define the Trainer
+    training_args = TrainingArguments(
+        per_device_eval_batch_size=8,
+        output_dir=output_dir,
+        remove_unused_columns=True,
+    )
+
+    trainer = Trainer(
+        model=model,
+        eval_dataset=val_dataset,
+        args=training_args,
+        data_collator=DataCollatorForSeq2Seq(
+            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
+        ),
+    )
+
+    # Evaluate the model
+    results = trainer.evaluate()
+    print(results)
+
+    return results
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model_path', type=str, required=True, help='Path to the fine-tuned model')
+    parser.add_argument('--dataset_path', type=str, required=True, help='Path to the dataset')
+    parser.add_argument('--output_dir', type=str, required=True, help='Directory to save evaluation results')
+
+    args = parser.parse_args()
+
+    evaluate_model(args.model_path, args.dataset_path, args.output_dir)

From 238f2747324b93742a6ff48d94e23d904c2ec6b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jie=20=E2=80=9CJW=E2=80=9D=20Wu?= <122728498+jie-jw-wu@users.noreply.github.com>
Date: Tue, 25 Feb 2025 16:15:44 -0800
Subject: [PATCH 2/2] Update evaluate_fine_tuned_model.py

---
 fine-tuning/evaluate_fine_tuned_model.py | 118 +++++++++++++++++------
 1 file changed, 90 insertions(+), 28 deletions(-)

diff --git a/fine-tuning/evaluate_fine_tuned_model.py b/fine-tuning/evaluate_fine_tuned_model.py
index c6b0dd6..c01c517 100644
--- a/fine-tuning/evaluate_fine_tuned_model.py
+++ b/fine-tuning/evaluate_fine_tuned_model.py
@@ -1,19 +1,35 @@
 import argparse
 import os
 import torch
-from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer, DataCollatorForSeq2Seq
+from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
 from datasets import load_dataset
+from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+
+def compute_metrics(pred):
+    labels = pred.label_ids
+    preds = pred.predictions.argmax(-1)
+    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
+    acc = accuracy_score(labels, preds)
+    return {
+        'accuracy': acc,
+        'f1': f1,
+        'precision': precision,
+        'recall': recall
+    }
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model_name_or_path', type=str, required=True, help='Path to the model')
+    parser.add_argument('--dataset_path', type=str, required=True, help='Path to the dataset')
+    parser.add_argument('--tokenize_version', type=int, choices=[1, 2, 3, 4], required=True, help='Select which tokenize function to use: 1, 2, 3, or 4')
+    args = parser.parse_args()
 
-def evaluate_model(model_path, dataset_path, output_dir):
     # Load the model and tokenizer
-    model = AutoModelForCausalLM.from_pretrained(model_path)
-    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path)
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
 
-    # Load the dataset
-    data = load_dataset('json', data_files=dataset_path)
-
-    # Tokenize the dataset
-    def tokenize_function(samples):
+    # Define the tokenize function based on the version
+    def tokenize_v1(samples):
         concatenated_text = samples['problem'] + samples['answer']
         result = tokenizer(
             concatenated_text,
@@ -24,38 +40,84 @@ def tokenize_function(samples):
         )
         result["labels"] = result["input_ids"].copy()
         return result
-
-    tokenized_data = data.map(tokenize_function)
+
+    def tokenize_v2(samples):
+        concatenated_text = samples['problem'] + samples['answer']
+        result = tokenizer(
+            concatenated_text,
+            truncation=True,
+            max_length=512,
+            padding=False,
+            return_tensors=None,
+        )
+        problem_tokens = tokenizer(samples['problem'], truncation=True, max_length=512, padding=False, return_tensors=None)["input_ids"]
+        answer_tokens = tokenizer(samples['answer'], truncation=True, max_length=512, padding=False, return_tensors=None)["input_ids"]
+        answer_start_idx = len(problem_tokens)
+        labels = [-100] * len(result["input_ids"])
+        labels[answer_start_idx:answer_start_idx + len(answer_tokens)] = result["input_ids"][answer_start_idx:answer_start_idx + len(answer_tokens)]
+        result["labels"] = labels
+        return result
+
+    def tokenize_v3(samples):
+        concatenated_text = samples['problem'] + samples['answer'] + samples['type']
+        result = tokenizer(
+            concatenated_text,
+            truncation=True,
+            max_length=512,
+            padding=False,
+            return_tensors=None,
+        )
+        result["labels"] = result["input_ids"].copy()
+        return result
+
+    def tokenize_v4(samples):
+        QPROMPT = "You are an expert software developer who writes high quality code. With below information, please either generate Python3 code (Respond directly with code only with markdown), or ask clarifying questions:\n"
+        if samples['type'] == "Original":
+            APROMPT = "This is a clear problem requiring no clarifications. Let's generate the required Python3 code directly in markdown."
+        else:
+            APROMPT = "I have a few clarifying questions. Please respond with the necessary details so I can assist further."
+        concatenated_text = f"{QPROMPT} {samples['problem']}" + f"{APROMPT} {samples['answer']}"
+        result = tokenizer(
+            concatenated_text,
+            truncation=True,
+            max_length=2048,
+            padding=False,
+            return_tensors=None,
+        )
+        result["labels"] = result["input_ids"].copy()
+        return result
+
+    if args.tokenize_version == 1:
+        tokenize_fn = tokenize_v1
+    elif args.tokenize_version == 2:
+        tokenize_fn = tokenize_v2
+    elif args.tokenize_version == 3:
+        tokenize_fn = tokenize_v3
+    elif args.tokenize_version == 4:
+        tokenize_fn = tokenize_v4
+
+    # Load the dataset
+    data = load_dataset('json', data_files=args.dataset_path)
+    tokenized_data = data.map(tokenize_fn)
     val_dataset = tokenized_data['train'].train_test_split(test_size=0.2, seed=42)['test']
 
     # Define the Trainer
     training_args = TrainingArguments(
-        per_device_eval_batch_size=8,
-        output_dir=output_dir,
-        remove_unused_columns=True,
+        per_device_eval_batch_size=16,
+        output_dir='./results',
+        logging_dir='./logs',
     )
 
     trainer = Trainer(
         model=model,
-        eval_dataset=val_dataset,
         args=training_args,
-        data_collator=DataCollatorForSeq2Seq(
-            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
-        ),
+        eval_dataset=val_dataset,
+        compute_metrics=compute_metrics,
     )
 
     # Evaluate the model
     results = trainer.evaluate()
     print(results)
 
-    return results
-
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--model_path', type=str, required=True, help='Path to the fine-tuned model')
-    parser.add_argument('--dataset_path', type=str, required=True, help='Path to the dataset')
-    parser.add_argument('--output_dir', type=str, required=True, help='Directory to save evaluation results')
-
-    args = parser.parse_args()
-
-    evaluate_model(args.model_path, args.dataset_path, args.output_dir)
+    main()
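
Editor's note (trailing text, not part of either patch): tokenize_v2 builds its label mask on the assumption that tokenizing 'problem' and 'answer' separately reproduces the prefix of the concatenated encoding, which holds for many BPE tokenizers without special tokens but not for all. A minimal sketch to sanity-check that assumption; "gpt2" and the example strings are placeholders, not values from the patch:

    # Sketch only: verify that the -100 mask in tokenize_v2 lands exactly on
    # the problem tokens and leaves the answer tokens as labels.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer

    problem = "Write a function that adds two numbers.\n"
    answer = "def add(a, b):\n    return a + b"

    full_ids = tokenizer(problem + answer)["input_ids"]
    problem_ids = tokenizer(problem)["input_ids"]
    answer_ids = tokenizer(answer)["input_ids"]

    # Mirror the v2 masking: -100 everywhere, labels only on the answer span.
    labels = [-100] * len(full_ids)
    start = len(problem_ids)
    labels[start:start + len(answer_ids)] = full_ids[start:start + len(answer_ids)]

    # True when the boundary assumption holds; False means the mask bleeds into
    # (or misses) part of the answer for this tokenizer and input pair.
    print(full_ids[:start] == problem_ids)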