Source code for fairdiverse.search.utils.utils

import os
import csv
import math
import gzip
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler


def split_list(origin_list, n):
    """
    Splits the input list into n sublists of roughly equal size.

    :param origin_list: The original list to be split.
    :param n: The number of sublists to split into.
    :return: A list of sublists.
    """
    res_list = []
    L = len(origin_list)
    N = int(math.ceil(L / float(n)))
    begin = 0
    end = begin + N
    while begin < L:
        if end < L:
            temp_list = origin_list[begin:end]
            res_list.append(temp_list)
            begin = end
            end += N
        else:
            temp_list = origin_list[begin:]
            res_list.append(temp_list)
            break
    return res_list

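A minimal usage sketch (the input values are made up) showing how the chunk size follows from n:

# Hypothetical example: split 7 items into n=3 sublists.
chunks = split_list([1, 2, 3, 4, 5, 6, 7], 3)
# chunk size N = ceil(7 / 3) = 3, so chunks == [[1, 2, 3], [4, 5, 6], [7]]
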
def load_embedding(filename, sep='\t'):
    """
    Loads an embedding from a file.

    :param filename: The embedding file name.
    :param sep: The character used as the separation symbol.
    :return: A dict with the item name as key and the embedding vector as value.
    """
    result = {}
    with open(filename, 'r') as fp:
        for l in fp:
            l = l.strip()
            if l == '':
                continue
            sp = l.split(sep)
            vals = [float(sp[i]) for i in range(1, len(sp))]
            result[sp[0]] = vals
    return result

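load_embedding assumes one item per line: the item name followed by the vector components, joined by sep. A sketch with a hypothetical file:

# Hypothetical file 'item_emb.tsv' with tab-separated lines such as:
#   doc_1<TAB>0.12<TAB>-0.53<TAB>0.88
emb = load_embedding('item_emb.tsv')
# emb['doc_1'] == [0.12, -0.53, 0.88]
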
def get_rel_feat(path):
    """
    Loads and scales the relevance features from a CSV file.

    :param path: Path to the CSV file containing the relevance features.
    :return: A dictionary where the key is a tuple (query, doc) and the value is a list of features.
    """
    rel_feat = pd.read_csv(path)
    rel_feat_names = list(sorted(set(rel_feat.columns) - {'query', 'doc'}))
    rel_feat[rel_feat_names] = StandardScaler().fit_transform(rel_feat[rel_feat_names])
    rel_feat = dict(zip(map(lambda x: tuple(x), rel_feat[['query', 'doc']].values),
                        rel_feat[rel_feat_names].values.tolist()))
    return rel_feat

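get_rel_feat expects a CSV with 'query' and 'doc' columns plus one column per feature; the feature columns are standardized (zero mean, unit variance) across the whole file before the dictionary is built. A sketch with a hypothetical file:

# Hypothetical file 'rel_feat.csv':
#   query,doc,bm25,tf_idf
#   q1,d1,12.3,0.45
#   q1,d2,8.7,0.21
feats = get_rel_feat('rel_feat.csv')
# feats[('q1', 'd1')] is a list of standardized feature values
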
def read_rel_feat(path):
    """
    Reads relevance features from a CSV file and returns them in a nested dictionary format.

    :param path: Path to the CSV file containing the relevance features.
    :return: A nested dictionary where the outer key is a query and the value is another
        dictionary mapping documents to feature arrays.
    """
    rel_feat = {}
    f = csv.reader(open(path, 'r'), delimiter=',')
    next(f)  # skip the header row
    for line in f:
        if line[0] not in rel_feat:
            rel_feat[line[0]] = {}
        if line[1] not in rel_feat[line[0]]:
            rel_feat[line[0]][line[1]] = np.array([float(val) for val in line[2:]])
    return rel_feat

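Unlike get_rel_feat, read_rel_feat keeps the raw feature values (no scaling) and nests them by query. With the same hypothetical CSV as above:

feats = read_rel_feat('rel_feat.csv')
# feats['q1']['d1'] == np.array([12.3, 0.45])  -- raw, unscaled values
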
def pkl_load(filename):
    """
    Loads a gzip-compressed pickle file and returns the data inside it.

    :param filename: Path to the pickle file.
    :return: The loaded data, or None if the file does not exist.
    """
    if not os.path.exists(filename):
        print('filename={} not exists!'.format(filename))
        return
    with gzip.open(filename, 'rb') as f:
        data_dict = pickle.load(f)
    return data_dict

def pkl_save(data_dict, filename):
    """
    Saves a dictionary to a gzip-compressed pickle file.

    :param data_dict: The dictionary to be saved.
    :param filename: The path where the pickle file should be saved.
    """
    with gzip.open(filename, 'wb') as f:
        pickle.dump(data_dict, f)

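pkl_save and pkl_load form a round trip over gzip-compressed pickles; pkl_load returns None (after printing a warning) for a missing file. A quick sketch with a hypothetical path:

pkl_save({'q1': [0.2, 0.8]}, 'cache.pkl.gz')
assert pkl_load('cache.pkl.gz') == {'q1': [0.2, 0.8]}
pkl_load('missing.pkl.gz')  # prints a warning, returns None
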
def remove_duplicate(input_path, output_path):
    """
    Removes duplicate documents from a ranking list, keeping the first occurrence
    of each (topic, document) pair.

    :param input_path: The path to the input file containing the ranking list.
    :param output_path: The path where the de-duplicated ranking list will be saved.
    """
    unique_records = set()
    output_lines = []
    with open(input_path, "r") as file:
        for line in file:
            parts = line.strip().split()
            topic_id = parts[0]
            document_name = parts[2]
            key = (topic_id, document_name)
            if key not in unique_records:
                unique_records.add(key)
                output_lines.append(line)
    with open(output_path, "w") as file:
        for line in output_lines:
            file.write(line)

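remove_duplicate assumes a whitespace-separated run file whose first column is the topic id and whose third column is the document name (the usual TREC run layout). For a hypothetical run file:

# Hypothetical 'run.txt':
#   101 Q0 doc_a 1 9.1 myrun
#   101 Q0 doc_a 2 8.7 myrun   <- dropped as a duplicate of (101, doc_a)
#   101 Q0 doc_b 3 8.2 myrun
remove_duplicate('run.txt', 'run_dedup.txt')
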
def restore_doc_ids(order_str, id_dict):
    """
    Restores document IDs based on an ordered list of indices and a dictionary of document IDs.

    :param order_str: A string such as "[1 > 3 > 2]" giving the order of document indices.
    :param id_dict: A dictionary mapping document IDs to indices.
    :return: A list of document IDs in the restored order.
    """
    order = [int(x) for x in order_str.replace(" ", "").replace("[", "").replace("]", "").split(">")]
    reversed_dict = {v: k for k, v in id_dict.items()}
    return [reversed_dict[num] for num in order if num in reversed_dict]

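restore_doc_ids inverts id_dict and maps each parsed index back to its document ID; indices absent from id_dict are silently skipped. A sketch with made-up IDs:

id_dict = {'doc_a': 0, 'doc_b': 1, 'doc_c': 2}  # hypothetical doc-ID-to-index mapping
restore_doc_ids('[2 > 0 > 1]', id_dict)
# -> ['doc_c', 'doc_a', 'doc_b']
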
def get_metrics_20(csv_file_path):
    """
    Retrieves evaluation metrics from a CSV file for the top 20 documents.

    :param csv_file_path: The path to the CSV file containing evaluation results.
    :return: A tuple containing the mean values of alpha-nDCG@20, NRBP, ERR-IA@20, and strec@20.
    """
    all_qids = range(1, 201)
    del_index = [94, 99]
    # drop topics 95 and 100 (positions 94 and 99), which are excluded from evaluation
    all_qids = np.delete(all_qids, del_index)
    qids = [str(i) for i in all_qids]
    df = pd.read_csv(csv_file_path)
    alpha_nDCG_20 = df.loc[df['topic'].isin(qids)]['alpha-nDCG@20'].mean()
    NRBP_20 = df.loc[df['topic'].isin(qids)]['NRBP'].mean()
    ERR_IA_20 = df.loc[df['topic'].isin(qids)]['ERR-IA@20'].mean()
    # Pre_IA_20 = df.loc[df['topic'].isin(qids)]['P-IA@20'].mean()
    S_rec_20 = df.loc[df['topic'].isin(qids)]['strec@20'].mean()
    return alpha_nDCG_20, NRBP_20, ERR_IA_20, S_rec_20
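get_metrics_20 expects a per-topic results CSV (the exact producer, e.g. a diversity evaluation script, is an assumption here) with a 'topic' column and metric columns named exactly as referenced above. A hedged sketch:

# Hypothetical 'eval_results.csv' with columns:
#   topic,alpha-nDCG@20,NRBP,ERR-IA@20,strec@20
a_ndcg, nrbp, err_ia, s_rec = get_metrics_20('eval_results.csv')
print('alpha-nDCG@20 = {:.4f}, NRBP = {:.4f}'.format(a_ndcg, nrbp))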