import os
import csv
import math
import gzip
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

def split_list(origin_list, n):
    """
    Splits the input list into n sublists of roughly equal size.
    :param origin_list: The original list to be split.
    :param n: The number of sublists to split into.
    :return: A list of sublists.
    """
    res_list = []
    L = len(origin_list)
    N = int(math.ceil(L / float(n)))  # target size of each sublist
    begin = 0
    end = begin + N
    while begin < L:
        if end < L:
            temp_list = origin_list[begin:end]
            res_list.append(temp_list)
            begin = end
            end += N
        else:
            temp_list = origin_list[begin:]
            res_list.append(temp_list)
            break
    return res_list
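
# A minimal usage sketch: ten items split into three chunks of
# ceil(10 / 3) = 4, 4, and 2 elements.
# >>> split_list(list(range(10)), 3)
# [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]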

def load_embedding(filename, sep='\t'):
    """
    Loads an embedding from a file.
    :param filename: embedding file name
    :param sep: the character used as the field separator
    :return: a dict with item name as key and embedding vector as value
    """
    result = {}
    with open(filename, 'r') as fp:
        for l in fp:
            l = l.strip()
            if l == '':
                continue
            sp = l.split(sep)
            result[sp[0]] = [float(v) for v in sp[1:]]
    return result
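
# The expected layout is one item per line: an identifier followed by its
# vector components, e.g. "item_42\t0.12\t-0.34" (a hypothetical example).
# >>> load_embedding('item.emb')['item_42']
# [0.12, -0.34]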

def get_rel_feat(path):
    """
    Loads and scales the relevance features from a CSV file.
    :param path: Path to the CSV file containing the relevance features.
    :return: A dictionary where the key is a tuple (query, doc) and the value is a list of features.
    """
    rel_feat = pd.read_csv(path)
    rel_feat_names = list(sorted(set(rel_feat.columns) - {'query', 'doc'}))
    # Standardize each feature column to zero mean and unit variance.
    rel_feat[rel_feat_names] = StandardScaler().fit_transform(rel_feat[rel_feat_names])
    rel_feat = dict(zip(map(tuple, rel_feat[['query', 'doc']].values),
                        rel_feat[rel_feat_names].values.tolist()))
    return rel_feat
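
# Sketch of the expected CSV layout; the feature column names are hypothetical:
#   query,doc,bm25,tf_idf
#   q1,d1,7.1,0.52
# get_rel_feat('rel_feat.csv') then maps ('q1', 'd1') to the standardized
# [bm25, tf_idf] values for that pair.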

def read_rel_feat(path):
    """
    Reads relevance features from a CSV file and returns them in a nested dictionary format.
    :param path: Path to the CSV file containing the relevance features.
    :return: A nested dictionary where the key is a query and the value is another dictionary mapping documents to feature arrays.
    """
    rel_feat = {}
    with open(path, 'r') as fp:
        reader = csv.reader(fp, delimiter=',')
        next(reader)  # skip the header row
        for line in reader:
            if line[0] not in rel_feat:
                rel_feat[line[0]] = {}
            if line[1] not in rel_feat[line[0]]:
                rel_feat[line[0]][line[1]] = np.array([float(val) for val in line[2:]])
    return rel_feat
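
# Unlike get_rel_feat, this keeps the raw (unscaled) values and nests them by
# query: for the hypothetical file sketched above,
# read_rel_feat('rel_feat.csv')['q1']['d1'] would be np.array([7.1, 0.52]).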

def pkl_load(filename):
    """
    Loads a pickle file and returns the data inside it.
    :param filename: Path to the gzip-compressed pickle file.
    :return: The loaded data, or None if the file does not exist.
    """
    if not os.path.exists(filename):
        print('filename={} does not exist!'.format(filename))
        return None
    with gzip.open(filename, 'rb') as f:
        data_dict = pickle.load(f)
    return data_dict

def pkl_save(data_dict, filename):
    """
    Saves a dictionary to a compressed pickle file.
    :param data_dict: The dictionary to be saved.
    :param filename: The path where the pickle file should be saved.
    """
    with gzip.open(filename, 'wb') as f:
        pickle.dump(data_dict, f)
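
# Round-trip sketch (the file name is hypothetical): pkl_save writes a
# gzip-compressed pickle that pkl_load reads back unchanged.
# >>> pkl_save({'a': 1}, 'cache.pkl.gz')
# >>> pkl_load('cache.pkl.gz')
# {'a': 1}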

def remove_duplicate(input_path, output_path):
    """
    Removes duplicate documents in the ranking list.
    :param input_path: The path to the input file containing the ranking list.
    :param output_path: The path where the cleaned ranking list will be saved.
    """
    unique_records = set()
    output_lines = []
    with open(input_path, "r") as file:
        for line in file:
            parts = line.strip().split()
            topic_id = parts[0]
            document_name = parts[2]
            # Keep only the first occurrence of each (topic, document) pair.
            key = (topic_id, document_name)
            if key not in unique_records:
                unique_records.add(key)
                output_lines.append(line)
    with open(output_path, "w") as file:
        for line in output_lines:
            file.write(line)
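
# The input is assumed to be a whitespace-separated run file whose first and
# third columns hold the topic ID and document name (as in the TREC run format
# "topic Q0 docno rank score tag"); only the first line per (topic, document)
# pair is written to the output.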

def restore_doc_ids(order_str, id_dict):
    """
    Restores document IDs based on an ordered list of indices and a dictionary of document IDs.
    :param order_str: A string representing the order of document indices, e.g. "[3 > 1 > 2]".
    :param id_dict: A dictionary mapping document IDs to indices.
    :return: A list of document IDs in the restored order.
    """
    # Strip brackets and whitespace, then split on '>' to recover the indices.
    order = [int(x) for x in order_str.replace(" ", "").replace("[", "").replace("]", "").split(">")]
    reversed_dict = {v: k for k, v in id_dict.items()}
    return [reversed_dict[num] for num in order if num in reversed_dict]
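
# Example with hypothetical IDs: id_dict assigns each document an index, and
# the order string is mapped back through the inverted dictionary.
# >>> restore_doc_ids("[2 > 1 > 3]", {'doc_a': 1, 'doc_b': 2, 'doc_c': 3})
# ['doc_b', 'doc_a', 'doc_c']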

def get_metrics_20(csv_file_path):
    """
    Retrieves evaluation metrics from a CSV file for the top 20 documents.
    :param csv_file_path: The path to the CSV file containing evaluation results.
    :return: A tuple containing the mean values of alpha-nDCG@20, NRBP, ERR-IA@20, and strec@20.
    """
    all_qids = np.arange(1, 201)
    del_index = [94, 99]  # exclude topics 95 and 100 (0-based indices into 1..200)
    all_qids = np.delete(all_qids, del_index)
    qids = [str(i) for i in all_qids]
    df = pd.read_csv(csv_file_path)
    alpha_nDCG_20 = df.loc[df['topic'].isin(qids)]['alpha-nDCG@20'].mean()
    NRBP_20 = df.loc[df['topic'].isin(qids)]['NRBP'].mean()
    ERR_IA_20 = df.loc[df['topic'].isin(qids)]['ERR-IA@20'].mean()
    # Pre_IA_20 = df.loc[df['topic'].isin(qids)]['P-IA@20'].mean()
    S_rec_20 = df.loc[df['topic'].isin(qids)]['strec@20'].mean()
    return alpha_nDCG_20, NRBP_20, ERR_IA_20, S_rec_20
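
# The CSV is assumed to hold one row per topic, with a 'topic' column plus the
# metric columns named above; the means are taken over topics 1-200 excluding
# 95 and 100 (the file name below is hypothetical).
# >>> alpha_ndcg, nrbp, err_ia, s_rec = get_metrics_20('eval_results.csv')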