DAMO-ConvAI/pcll/final_score.py
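# Post-hoc evaluation script: reads the saved per-task metrics of a continual-learning
# run and reports (1) the average score over all tasks at the final evaluation and
# (2) the average forgetting of each task relative to its best earlier score.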


import json
import os
from settings import parse_args
args = parse_args()
# Compute the average score and the average forgetting over all observed tasks. =================================================
test_res = open(os.path.join(args.res_dir, 'res.txt'), 'a')  # summary results file (appended to)
# * Load saved scores.
score_all_time = []
with open(os.path.join(args.output_dir, "metrics.json"), "r") as f:
    score_all_time = [json.loads(row) for row in f.readlines()]
    # score_per_time = json.load(f)
    # score_all_time.append(score_per_time)
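# NOTE (assumed format, inferred from the accesses below, not documented in this file):
# each line of metrics.json is a JSON object mapping a task name to its metrics, e.g.
#   {"task_a": {"intent_acc": 0.91}, "task_b": {"intent_acc": 0.88}}
# with one line appended per evaluation step; the values above are purely illustrative.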
# * Calculate the average score over all observed tasks, using the final-time scores.
average_score = []
score_last_time = score_all_time[-1]  # scores from the final evaluation (last task, last epoch)
print('Last time scores for all tasks:\n', score_last_time, file=test_res)
print('Last time scores for all tasks:\n', score_last_time, flush=True)
for task in score_last_time:
    if args.data_type == 'intent':
        average_score.append(score_last_time[task]['intent_acc'])
    elif args.data_type == 'slot':
        average_score.append(score_last_time[task]['slot_f1'])
# * Calculate the average forgetting over all observed tasks.
# Track the best score each task reached at any evaluation before the final one.
score_highest = score_all_time[0]
for score_per_time in score_all_time[:-1]:  # exclude the final evaluation
    for task_name in score_highest:
        if args.data_type == 'intent':  # * classification (intent detection) tasks
            if score_highest[task_name]['intent_acc'] < score_per_time[task_name]['intent_acc']:
                score_highest[task_name]['intent_acc'] = score_per_time[task_name]['intent_acc']
        elif args.data_type == 'slot':  # * slot-tagging tasks
            if score_highest[task_name]['slot_f1'] < score_per_time[task_name]['slot_f1']:
                score_highest[task_name]['slot_f1'] = score_per_time[task_name]['slot_f1']
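# Forgetting for a task k is the gap between the best score it reached at any
# evaluation before the final one and its score at the final evaluation:
#   forgetting_k = max_{t < T} score_{t,k} - score_{T,k}
# The last trained task (args.tasks[-1]) is skipped below, since it was just trained
# and cannot have been forgotten yet.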
average_forgetting = []
average_forgetting_ratio = []
forgetting_dict = {}
for task in score_highest:
    if task != args.tasks[-1]:  # exclude the last trained task
        # * intent detection
        if args.data_type == 'intent':
            diff = score_highest[task]['intent_acc'] - score_last_time[task]['intent_acc']
            forgetting_dict[task] = diff
            average_forgetting.append(diff)
        # * slot tagging
        elif args.data_type == 'slot':
            diff = score_highest[task]['slot_f1'] - score_last_time[task]['slot_f1']
            forgetting_dict[task] = diff
            diff_ratio = diff / score_highest[task]['slot_f1']  # relative forgetting
            average_forgetting.append(diff)
            average_forgetting_ratio.append(diff_ratio)
print('Last time forgetting evaluation for all tasks:\n',forgetting_dict, file=test_res)
print('Last time forgetting evaluation for all tasks:\n',forgetting_dict, flush=True)
# * Print out the final results: average score and average forgetting.
avg_score = sum(average_score) / len(average_score)
avg_forgetting = sum(average_forgetting) / len(average_forgetting)
print('Average score is %.4f' % avg_score, file=test_res)
print('Average score is %.4f' % avg_score, flush=True)
print('Average forgetting is %.4f' % avg_forgetting, file=test_res)
print('Average forgetting is %.4f' % avg_forgetting, flush=True)
if average_forgetting_ratio:  # only populated for slot-tagging runs
    avg_ratio = sum(average_forgetting_ratio) / len(average_forgetting_ratio)
    print('Average forgetting ratio is %.4f' % avg_ratio, file=test_res)
    print(avg_score, '\t', avg_forgetting, '\t', avg_ratio, file=test_res)
else:
    print(avg_score, '\t', avg_forgetting, file=test_res)
test_res.close()
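# Example invocation (hypothetical flag names; the real argument names are defined in
# settings.parse_args and may differ):
#   python final_score.py --output_dir outputs/run1 --res_dir outputs/run1/ --data_type intent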