Skip to content

Similarity utils

Similarity utilities for search and discovery.

compute_bm25_score(query, document, additional_context=None)

Compute BM25 score manually for a query and document, optionally including additional context.

Parameters:

Name Type Description Default
query

Query string to search for.

required
document

Document string to search in.

required
additional_context

Optional additional context string to consider. Defaults to None.

None

Returns:

Type Description

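BM25 similarity score between the query and the document. When additional context is provided, the higher of the document score and the additional-context score is returned. An empty query yields 0.0.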

Source code in blue/utils/similarity_utils.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
def compute_bm25_score(query, document, additional_context=None):
    """Score a document against a query with BM25, optionally considering extra context.

    Parameters:
        query: Query string to search for.
        document: Document string to search in.
        additional_context: Optional additional context string that is scored
            against the query as well. Defaults to None.

    Returns:
        0.0 when the query is empty. Otherwise the BM25 score of the document,
        or, when additional context is given, the larger of the document score
        and the additional-context score.
    """
    # Nothing to match against: short-circuit before any scoring.
    if not query:
        return 0.0

    best_score = _compute_bm25_score_single(query, document)

    # Without extra context the document score is the final answer.
    if not additional_context:
        return best_score

    alternative_score = _compute_bm25_score_single(query, additional_context)
    return max(best_score, alternative_score)

compute_vector_score(query_vector, doc_vector, normalize_score=True)

Compute semantic similarity between two embedding vectors using cosine similarity.

Parameters:

Name Type Description Default
query_vector Union[bytes, ndarray]

Query embedding vector as bytes or numpy array.

required
doc_vector Union[bytes, ndarray]

Document embedding vector as bytes or numpy array.

required
normalize_score bool

Whether to normalize score to [0,1] range. Defaults to True.

True

Returns:

Type Description
float

Similarity score in [0,1] if normalize_score=True, otherwise [-1,1].

Source code in blue/utils/similarity_utils.py
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def compute_vector_score(query_vector: Union[bytes, np.ndarray], doc_vector: Union[bytes, np.ndarray], normalize_score: bool = True) -> float:
    """Compute semantic similarity between two embedding vectors using cosine similarity.

    Parameters:
        query_vector: Query embedding vector as bytes or numpy array. Bytes are
            decoded as a flat float32 buffer.
        doc_vector: Document embedding vector as bytes or numpy array. Bytes are
            decoded as a flat float32 buffer.
        normalize_score: Whether to normalize score to [0,1] range. Defaults to True.

    Returns:
        Similarity score in [0,1] if normalize_score=True, otherwise [-1,1].
        0.0 is returned when either vector is None, empty, or has zero norm.
    """
    query_array = _as_embedding_array(query_vector)
    doc_array = _as_embedding_array(doc_vector)

    # Check emptiness via size: the truth value of a multi-element ndarray is
    # ambiguous and raises ValueError, so `not array` must not be used here.
    if query_array is None or doc_array is None:
        return 0.0
    if query_array.size == 0 or doc_array.size == 0:
        return 0.0

    # Compute norms; a zero vector has no direction, so similarity is undefined.
    query_norm = np.linalg.norm(query_array)
    doc_norm = np.linalg.norm(doc_array)
    if query_norm == 0 or doc_norm == 0:
        return 0.0

    # Cosine similarity of the L2-normalized vectors.
    cosine = np.dot(query_array / query_norm, doc_array / doc_norm)

    # Floating-point rounding can push the result marginally outside [-1, 1];
    # np.clip keeps the documented range (and propagates NaN unchanged).
    similarity = float(np.clip(cosine, -1.0, 1.0))

    # Map [-1, 1] onto [0, 1].
    if normalize_score:
        similarity = (similarity + 1.0) / 2.0

    return similarity


def _as_embedding_array(vector):
    """Return `vector` as a numpy array (float32-decoded if bytes), or None if it is None."""
    if vector is None:
        return None
    if isinstance(vector, (bytes, bytearray)):
        return np.frombuffer(vector, dtype=np.float32)
    return np.asarray(vector)

normalize_bm25_scores(scores, method='minmax', max_score=20.0)

Normalize BM25 scores using different methods.

Parameters:

Name Type Description Default
scores

List of BM25 scores to normalize.

required
method

Normalization method: one of 'linear', 'log', or 'minmax'. Any unrecognized value falls back to 'minmax'. Defaults to 'minmax'.

'minmax'
max_score

Maximum score for linear normalization. Defaults to 20.0.

20.0

Returns:

Type Description

List of normalized scores in the same order as input scores.

Source code in blue/utils/similarity_utils.py
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
def normalize_bm25_scores(scores, method='minmax', max_score=20.0):
    """Normalize BM25 scores using different methods.

    Parameters:
        scores: List (or numpy array) of BM25 scores to normalize.
        method: Normalization method ('linear', 'log', 'minmax'). Any other
            value falls back to 'minmax'. Defaults to 'minmax'.
        max_score: Score that maps to 1.0 under 'linear' normalization; larger
            scores are capped at 1.0. Not used by the other methods.
            Defaults to 20.0.

    Returns:
        List of normalized scores in the same order as input scores. An empty
        list is returned for empty or None input.
    """
    if scores is None:
        return []

    # Test emptiness via size: `not scores` raises ValueError for a numpy
    # array with more than one element.
    values = np.asarray(scores, dtype=float)
    if values.size == 0:
        return []

    if method == 'linear':
        # Divide by the reference maximum and cap at 1.0.
        normalized = np.minimum(values / max_score, 1.0)
    elif method == 'log':
        # Compress large scores with log(1 + x), then min-max scale to [0,1].
        normalized = _minmax_scale(np.log1p(values))
    else:
        # 'minmax', and the fallback for any unrecognized method name.
        normalized = _minmax_scale(values)

    # Return plain Python floats in the same order as the input.
    return [float(value) for value in normalized]


def _minmax_scale(values):
    """Linearly rescale `values` so the minimum maps to 0.0 and the maximum to 1.0."""
    lowest = values.min()
    spread = values.max() - lowest
    if spread < 1e-8:
        # Single score or identical scores: there is no spread to scale by,
        # so every score is reported as 1.0.
        return np.ones_like(values)
    return (values - lowest) / spread
Last update: 2025-10-05