# Source code for fairdiverse.search.utils.process_bm25

import os
import math
import json
import pickle
import numpy as np
from collections import Counter
from tqdm import tqdm

from .utils import pkl_save

MAXDOC = 50


def generate_bm25_scores_for_query(config):
    """
    Generates BM25 relevance scores for queries and their suggested variations against documents.

    For every query found in ``<data_dir>/div_query.data`` the first ``MAXDOC`` documents of
    its ``doc_list`` are scored with :func:`calculate_bm25`, once against the original query
    and once against each entry of its ``query_suggestion``. The result is written with
    ``pkl_save`` to ``<data_dir>/<model>/bm25_scores.pkl`` as a dictionary::

        {qid: [ {doc_id: score_for_original_query},
                np.array([[scores for suggestion 0], [scores for suggestion 1], ...]) ]}

    :param config: A dictionary containing configuration parameters. The keys read here are
        ``'data_dir'`` (location of ``div_query.data`` and of the output),
        ``'data_content_dir'`` (directory with one JSON file per document) and ``'model'``
        (name of the output sub-directory).
    """
    data_dir = config['data_dir']

    # NOTE(security): pickle.load can execute arbitrary code embedded in the file, so
    # data_dir must only ever point at trusted data.
    with open(os.path.join(data_dir, 'div_query.data'), 'rb') as fr:
        qd = pickle.load(fr)

    data_content_dir = config['data_content_dir']
    doc_content_dict = {}
    for fname in os.listdir(data_content_dir):
        with open(os.path.join(data_content_dir, fname), 'r') as fr:
            data_dict = json.load(fr)
        # The document id is the file name minus its last five characters
        # (presumably the ".json" extension -- every file in the directory is parsed as JSON).
        doc_content_dict[fname[:-5]] = data_dict

    bm25_dict = {}
    for qid in tqdm(qd):
        query = qd[qid].query
        doc_list = qd[qid].doc_list[:MAXDOC]
        query_suggestion = qd[qid].query_suggestion
        doc_content_list = [doc_content_dict[doc_id]['body_text'] for doc_id in doc_list]

        # Slot 0: scores of the original query, keyed by document id.
        bm25_scores = calculate_bm25(query, doc_content_list)
        bm25_dict[qid] = [dict(zip(doc_list, bm25_scores))]

        # Slot 1: one row of scores per suggested query, in doc_list order.
        sub_query_bm25 = [
            calculate_bm25(sub_query, doc_content_list) for sub_query in query_suggestion
        ]
        bm25_dict[qid].append(np.array(sub_query_bm25))

    output_dir = os.path.join(data_dir, config['model'])
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(output_dir, exist_ok=True)
    pkl_save(bm25_dict, os.path.join(output_dir, 'bm25_scores.pkl'))
def calculate_bm25(query, documents, k1=1.5, b=0.75):
    """
    Calculates BM25 relevance scores between a query and a list of documents.

    Query and documents are tokenised with ``str.split()`` (whitespace, case-sensitive,
    no stemming). For each query term ``t`` occurring ``f`` times in a document of length
    ``dl`` the contribution is::

        idf(t) * f * (k1 + 1) / (f + k1 * (1 - b + b * dl / avgdl))
        idf(t) = ln((N - df(t) + 0.5) / (df(t) + 0.5) + 1)

    where ``N`` is the number of documents, ``df(t)`` the number of documents containing
    ``t`` and ``avgdl`` the mean document length in tokens. A term repeated in the query
    contributes once per repetition.

    :param query: A string containing the search query
    :param documents: A list of strings, where each string is a document's text
    :param k1: Float parameter controlling term frequency scaling (default: 1.5)
    :param b: Float parameter controlling document length normalization (default: 0.75)
    :return: A list of float values representing BM25 scores for each document, in the
        same order as ``documents``; an empty list when ``documents`` is empty
    """
    num_docs = len(documents)
    if num_docs == 0:
        # Nothing to score; also avoids dividing by zero when computing avgdl.
        return []

    # Tokenise and count every document exactly once instead of once per query term.
    tokenized_docs = [doc.split() for doc in documents]
    term_counts_per_doc = [Counter(tokens) for tokens in tokenized_docs]
    avgdl = sum(len(tokens) for tokens in tokenized_docs) / num_docs

    query_terms = query.split()

    # IDF depends only on the term and the collection, so compute it once per distinct term.
    idf_by_term = {}
    for term in set(query_terms):
        df = sum(1 for term_counts in term_counts_per_doc if term in term_counts)
        idf_by_term[term] = math.log((num_docs - df + 0.5) / (df + 0.5) + 1.0)

    bm25_scores = []
    for tokens, term_counts in zip(tokenized_docs, term_counts_per_doc):
        doc_length = len(tokens)
        score = 0.0
        for term in query_terms:
            if term not in term_counts:
                continue
            f = term_counts[term]
            # avgdl is guaranteed > 0 here: a matching term means this document is non-empty.
            score += idf_by_term[term] * (f * (k1 + 1)) / (
                f + k1 * (1 - b + b * (doc_length / avgdl))
            )
        bm25_scores.append(score)

    return bm25_scores