Skip to content

Metadata

Bases: ServiceClient

Source code in blue/metadata.py
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
class MetaData(ServiceClient):

    def __init__(self, properties=None):
        """
        Create the metadata service client.

        Sets the service name, runs the ServiceClient initializer with it, and
        then installs the metadata-specific default properties.

        Parameters:
            properties (optional): Forwarded unchanged to the ServiceClient
                initializer. Defaults to None.
        """
        # The name is assigned first because it is passed to the base initializer.
        self.name = "metadata"
        super().__init__(self.name, properties=properties)
        # Runs after the base initializer and assigns its keys unconditionally.
        # NOTE(review): presumably this overwrites same-named keys supplied via
        # `properties` -- confirm against ServiceClient.
        self._init_metadata_properties()

    ###### initialization
    def _init_metadata_properties(self):
        """
        Install the default properties of the metadata service.

        Every key below is assigned unconditionally on self.properties, in a
        fixed order: request settings, service prefix and URL, output clean-up
        rules, per-level generation switches, and the aggregation prompt.
        """
        props = self.properties

        # Request defaults and the path used to read the answer back
        for key, value in (
            ('openai.api', 'ChatCompletion'),
            ('openai.model', "gpt-4o"),
            ('input_json', "[{\"role\": \"user\"}]"),
            ('input_context', "$[0]"),
            ('input_context_field', "content"),
            ('input_field', "messages"),
            ('input_template', "${input}"),
            ('output_path', '$.choices[0].message.content'),
            ('openai.stream', False),
            ('openai.max_tokens', 300),
            # prefix for service specific properties
            ('service_prefix', 'openai'),
        ):
            props[key] = value

        # service_url, set as default
        props["service_url"] = PROPERTIES["services.openai.service_url"]

        # Output clean-up rules followed by the entity/attribute switches.
        # NOTE(review): the second rule replaces the text "json" with "" --
        # presumably everywhere in the output, not only in a code-fence label;
        # confirm this cannot mangle attribute names or descriptions.
        for key, value in (
            ('output_transformations', [{"transformation": "replace", "from": "```", "to": ""}, {"transformation": "replace", "from": "json", "to": ""}]),
            ('output_strip', True),
            ('enable_entity_description_generation', True),
            ('enable_attribute_description_generation', True),
        ):
            props[key] = value

        # Description aggregation from children
        props['aggregation_prompt'] = AGGREGATION_PROMPT
        for key in ('enable_database_description_generation', 'enable_collection_description_generation'):
            props[key] = True

    def build_entity_description_prompt(self, entity_obj, attributes):
        """
        Compose the LLM prompt that asks for entity and attribute descriptions.

        The prompt lists the entity's name, scope and type, followed by one
        bullet per attribute (name, type, and up to three sample values), and
        ends with the JSON shape the model is asked to return.

        Parameters:
            entity_obj (dict): Entity record; the "name", "scope" and "type"
                keys are read, each falling back to "Unknown".
            attributes (list[dict]): Attribute records. For each one, "name",
                properties.info.type (fallback "unknown") and
                properties.stats.sample_values (fallback empty) are read.

        Returns:
            str: The multi-line prompt requesting a JSON object with a
            "table_description" string and an "attributes" mapping.
        """

        def bullet(attribute):
            # One line per attribute; only the first three samples are shown.
            details = attribute.get("properties", {})
            declared_type = details.get("info", {}).get("type", "unknown")
            samples = details.get("stats", {}).get("sample_values", [])
            preview = ', '.join(str(sample) for sample in samples[:3])
            return f"- {attribute.get('name')} ({declared_type}), samples: {preview}"

        entity_name = entity_obj.get("name", "Unknown")
        entity_scope = entity_obj.get("scope", "Unknown")
        entity_type = entity_obj.get("type", "Unknown")

        attribute_block = "\n".join(bullet(attribute) for attribute in attributes)

        # The template text (including its indentation) is sent to the model as is.
        return f"""
        You are given a database entity definition with its attributes and metadata.
        Your task is to generate a structured JSON output with:
        1. A concise human-readable description of what this table/entity represents.
        2. Concise descriptions of each attribute.

        Entity Name: {entity_name}
        Scope: {entity_scope}
        Type: {entity_type}

        Attributes:
        {attribute_block}

        Output JSON format (do not include extra commentary, only valid JSON):

        {{
        "table_description": "string",
        "attributes": {{
            "attr_name": "description of attribute",
            ...
        }}
        }}
        """

    def enrich_entity(self, entity, attributes):
        """
        Ask the LLM service to describe an entity and its attributes.

        The prompt comes from build_entity_description_prompt and is sent
        through execute_api_call with this client's properties.

        Parameters:
            entity (dict): The entity record to describe.
            attributes (list[dict]): Attribute records belonging to the entity.

        Returns:
            Whatever execute_api_call returns for the prompt (callers in this
            class parse it as JSON text).
        """
        return self.execute_api_call(
            self.build_entity_description_prompt(entity, attributes),
            properties=self.properties,
            additional_data={},
        )

    def collect_source_metadata(self, data_registry, source, recursive=False, rebuild=False):
        """
        Collect metadata for every database of a data source.

        Nothing is generated at the source level itself: when recursive is
        False the method returns immediately. When True, each database listed
        by the registry is handed to collect_source_database_metadata.

        Parameters:
            data_registry (DataRegistry): Registry used to list the databases
                and passed on to the per-database collector.
            source (str): Identifier of the data source.
            recursive (bool, optional): Whether to descend into the source's
                databases. Defaults to False.
            rebuild (bool, optional): Forwarded unchanged to the per-database
                collector. Defaults to False.

        Returns:
            None
        """
        if not recursive:
            return

        # An empty/None listing simply means there is nothing to descend into.
        for database in data_registry.get_source_databases(source) or []:
            # Registry listings may yield full records (dicts) instead of bare
            # names; the per-database collector expects the database name.
            database_name = database.get("name") if isinstance(database, dict) else database
            self.collect_source_database_metadata(data_registry, source, database_name, recursive=recursive, rebuild=rebuild)
        return

    def collect_source_database_metadata(self, data_registry, source, database, recursive=False, rebuild=False):
        """
        Collect and enrich metadata for a database within a data source.

        When database description generation is enabled and the database has
        no (non-blank) description yet, a description is generated from the
        database metadata and the descriptions of its collections, then stored
        in the registry. With recursive=True every collection is afterwards
        processed by collect_source_database_collection_metadata.

        Parameters:
            data_registry (DataRegistry): Registry used to read collections,
                descriptions and metadata, and to store the new description.
            source (str): Identifier of the data source.
            database (str): Name of the database.
            recursive (bool, optional): If True, also collect metadata for all
                collections of the database. Defaults to False.
            rebuild (bool, optional): Forwarded to the registry setter and to
                the per-collection collector. Defaults to False.

        Returns:
            None
        """
        collections = data_registry.get_source_database_collections(source, database) or []

        if self.properties.get('enable_database_description_generation', True):
            current_description = data_registry.get_source_database_description(source, database)
            if not current_description or current_description.strip() == "":

                database_metadata = data_registry.get_source_database_property(source, database, "metadata")
                if not database_metadata:
                    # Minimal stand-in so the prompt still names the database.
                    database_metadata = {"name": database, "type": "database"}

                # NOTE(review): these are the collection descriptions as they
                # exist before the recursive pass below runs -- confirm whether
                # a bottom-up order is wanted.
                collection_descriptions = {collection.get("name"): collection.get("description") for collection in collections}

                database_desc = self.enrich_database_description(database, collection_descriptions, database_metadata)

                data_registry.set_source_database_description(source, database, database_desc, rebuild=rebuild)

        if recursive:
            for collection in collections:
                # The listing yields collection records; the per-collection
                # collector takes the collection *name*, so pass that instead
                # of the whole record.
                collection_name = collection.get("name") if isinstance(collection, dict) else collection
                self.collect_source_database_collection_metadata(data_registry, source, database, collection_name, recursive=recursive, rebuild=rebuild)

        return

    def collect_source_database_collection_metadata(self, data_registry, source, database, collection, recursive=False, rebuild=False):
        """
        Collect and enrich metadata for a collection and its entities.

        For every entity of the collection an LLM call produces a JSON object
        with a table description and per-attribute descriptions. Depending on
        the enable_* properties these are stored in the registry, but only
        where the current description is missing or blank. Entities whose LLM
        answer cannot be used are logged and skipped. Finally, if enabled and
        still missing, a collection-level description is generated from the
        entity descriptions gathered in this run.

        Parameters:
            data_registry (DataRegistry): Registry used to read entities,
                attributes and descriptions and to store generated ones.
            source (str): Identifier of the data source.
            database (str): Name of the database containing the collection.
            collection (str): Name of the collection to process.
            recursive (bool, optional): Accepted for signature parity with the
                other collectors; not used by this method. Defaults to False.
            rebuild (bool, optional): Forwarded to the registry setters.
                Defaults to False.

        Returns:
            None
        """

        entities = data_registry.get_source_database_collection_entities(source, database, collection)

        entity_descriptions = {}
        for entity in entities:
            entity_name = entity.get("name")

            attributes = data_registry.get_source_database_collection_entity_attributes(source, database, collection, entity_name)

            entity_attribute_description = self.enrich_entity(entity, attributes)

            try:
                parsed = json_utils.safe_json_parse(entity_attribute_description)
            except json.JSONDecodeError:
                # Skip the entity, as the message says, instead of carrying on
                # with an empty result and storing blank descriptions.
                logging.warning("LLM did not return valid JSON. Skipping entity enrichment.")
                continue

            # Anything other than a non-empty JSON object cannot be used below.
            if not parsed or not isinstance(parsed, dict):
                logging.warning(f"Entity {entity} returned invalid or empty JSON.")
                continue

            table_desc = parsed.get("table_description", "")
            attribute_descs = parsed.get("attributes", {})
            if not isinstance(attribute_descs, dict):
                # The model ignored the requested shape; keep the table description only.
                attribute_descs = {}
            entity_descriptions[entity_name] = table_desc

            if self.properties.get('enable_entity_description_generation', True):
                current_description = data_registry.get_source_database_collection_entity_description(source, database, collection, entity_name)

                if not current_description or current_description.strip() == "":
                    data_registry.set_source_database_collection_entity_description(source, database, collection, entity_name, table_desc, rebuild=rebuild)

            if self.properties.get('enable_attribute_description_generation', True):
                for attr, desc in attribute_descs.items():
                    current_description = data_registry.get_source_database_collection_entity_attribute_description(source, database, collection, entity_name, attr)
                    if not current_description or current_description.strip() == "":
                        data_registry.set_source_database_collection_entity_attribute_description(source, database, collection, entity_name, attr, desc, rebuild=rebuild)

        if self.properties.get('enable_collection_description_generation', True):
            current_description = data_registry.get_source_database_collection_description(source, database, collection)
            if not current_description or current_description.strip() == "":

                collection_metadata = data_registry.get_source_database_collection_property(source, database, collection, "metadata")

                if not collection_metadata:
                    # Minimal stand-in so the prompt still names the collection.
                    collection_metadata = {"name": collection, "type": "collection"}

                # The prompt must name the collection being described, not its
                # parent database.
                collection_desc = self.enrich_collection_description(collection, entity_descriptions, collection_metadata)

                data_registry.set_source_database_collection_description(source, database, collection, collection_desc, rebuild=rebuild)

    ###### Aggregation
    def build_collection_description_prompt(self, collection_name, entity_descriptions, collection_metadata):
        """
        Fill the aggregation prompt template for a collection.

        Entities with an empty description are left out; if none remain, a
        fixed placeholder line is used instead.

        Parameters:
            collection_name (str): The name of the collection.
            entity_descriptions (dict): Mapping of entity names to their descriptions.
            collection_metadata (dict or str): Additional metadata, rendered
                into the prompt via string formatting.

        Returns:
            str: The 'aggregation_prompt' property formatted with the entity
            lines and the collection name/metadata.
        """
        described = []
        for entity_name, description in entity_descriptions.items():
            if description:
                described.append(f"{entity_name}: {description}")

        children_text = '\n'.join(described) if described else "No entity descriptions available"

        template = self.properties['aggregation_prompt']
        parent_text = f"Collection name: {collection_name}\nMetadata: {collection_metadata}"
        return template.format(child_type='entity', parent_type='collection', child_descriptions=children_text, parent_metadata=parent_text)

    def build_database_description_prompt(self, database_name, collection_descriptions, database_metadata):
        """
        Fill the aggregation prompt template for a database.

        Collections with an empty description are left out; if none remain, a
        fixed placeholder line is used instead.

        Parameters:
            database_name (str): The name of the database.
            collection_descriptions (dict): Mapping of collection names to their descriptions.
            database_metadata (dict or str): Additional metadata, rendered
                into the prompt via string formatting.

        Returns:
            str: The 'aggregation_prompt' property formatted with the
            collection lines and the database name/metadata.
        """
        described = []
        for collection_name, description in collection_descriptions.items():
            if description:
                described.append(f"{collection_name}: {description}")

        children_text = '\n'.join(described) if described else "No collection descriptions available"

        template = self.properties['aggregation_prompt']
        parent_text = f"Database name: {database_name}\nMetadata: {database_metadata}"
        return template.format(child_type='collection', parent_type='database', child_descriptions=children_text, parent_metadata=parent_text)

    def enrich_collection_description(self, collection_name, entity_descriptions, collection_metadata):
        """
        Generate a collection-level description through the LLM service.

        The prompt comes from build_collection_description_prompt and is sent
        through execute_api_call with this client's properties.

        Parameters:
            collection_name (str): The name of the collection.
            entity_descriptions (dict): Mapping of entity names to their descriptions.
            collection_metadata (dict or str): Additional metadata for the collection.

        Returns:
            Any: Whatever execute_api_call returns for the prompt.
        """
        return self.execute_api_call(
            self.build_collection_description_prompt(collection_name, entity_descriptions, collection_metadata),
            properties=self.properties,
            additional_data={},
        )

    def enrich_database_description(self, database_name, collection_descriptions, database_metadata):
        """
        Generate a database-level description through the LLM service.

        The prompt comes from build_database_description_prompt and is sent
        through execute_api_call with this client's properties.

        Parameters:
            database_name (str): The name of the database.
            collection_descriptions (dict): Mapping of collection names to their descriptions.
            database_metadata (dict or str): Additional metadata for the database.

        Returns:
            Any: Whatever execute_api_call returns for the prompt.
        """
        return self.execute_api_call(
            self.build_database_description_prompt(database_name, collection_descriptions, database_metadata),
            properties=self.properties,
            additional_data={},
        )

build_collection_description_prompt(collection_name, entity_descriptions, collection_metadata)

Build a prompt string for generating or enriching a collection description.

Constructs a formatted text prompt using entity-level descriptions and metadata, suitable for passing to an LLM or enrichment API.

Parameters:

Name Type Description Default
collection_name str

The name of the collection.

required
entity_descriptions dict

Mapping of entity names to their descriptions.

required
collection_metadata dict or str

Additional metadata for the collection.

required

Returns:

Name Type Description
str

A formatted prompt string for collection-level description enrichment.

Source code in blue/metadata.py
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
def build_collection_description_prompt(self, collection_name, entity_descriptions, collection_metadata):
    """
    Fill the aggregation prompt template for a collection.

    Entities with an empty description are left out; if none remain, a
    fixed placeholder line is used instead.

    Parameters:
        collection_name (str): The name of the collection.
        entity_descriptions (dict): Mapping of entity names to their descriptions.
        collection_metadata (dict or str): Additional metadata, rendered
            into the prompt via string formatting.

    Returns:
        str: The 'aggregation_prompt' property formatted with the entity
        lines and the collection name/metadata.
    """
    described = []
    for entity_name, description in entity_descriptions.items():
        if description:
            described.append(f"{entity_name}: {description}")

    children_text = '\n'.join(described) if described else "No entity descriptions available"

    template = self.properties['aggregation_prompt']
    parent_text = f"Collection name: {collection_name}\nMetadata: {collection_metadata}"
    return template.format(child_type='entity', parent_type='collection', child_descriptions=children_text, parent_metadata=parent_text)

build_database_description_prompt(database_name, collection_descriptions, database_metadata)

Build a prompt string for generating or enriching a database description.

Constructs a formatted text prompt using collection-level descriptions and metadata, suitable for passing to an LLM or enrichment API.

Parameters:

Name Type Description Default
database_name str

The name of the database.

required
collection_descriptions dict

Mapping of collection names to their descriptions.

required
database_metadata dict or str

Additional metadata for the database.

required

Returns:

Name Type Description
str

A formatted prompt string for database-level description enrichment.

Source code in blue/metadata.py
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
def build_database_description_prompt(self, database_name, collection_descriptions, database_metadata):
    """
    Fill the aggregation prompt template for a database.

    Collections with an empty description are left out; if none remain, a
    fixed placeholder line is used instead.

    Parameters:
        database_name (str): The name of the database.
        collection_descriptions (dict): Mapping of collection names to their descriptions.
        database_metadata (dict or str): Additional metadata, rendered
            into the prompt via string formatting.

    Returns:
        str: The 'aggregation_prompt' property formatted with the
        collection lines and the database name/metadata.
    """
    described = []
    for collection_name, description in collection_descriptions.items():
        if description:
            described.append(f"{collection_name}: {description}")

    children_text = '\n'.join(described) if described else "No collection descriptions available"

    template = self.properties['aggregation_prompt']
    parent_text = f"Database name: {database_name}\nMetadata: {database_metadata}"
    return template.format(child_type='collection', parent_type='database', child_descriptions=children_text, parent_metadata=parent_text)

build_entity_description_prompt(entity_obj, attributes)

Build a prompt for generating an entity description using an LLM.

Constructs a structured text prompt containing entity metadata and attribute information, suitable for guiding an LLM to produce a JSON-formatted description of the entity and its attributes.

Parameters:

Name Type Description Default
entity_obj dict

A dictionary representing the entity, from the data registry.

required
attributes list[dict]

A list of attribute definitions.

required

Returns:

Name Type Description
str

A formatted multi-line string prompt, instructing the LLM to produce a JSON object with two keys: "table_description" (a human-readable description of the entity) and "attributes" (a mapping of attribute names to their descriptions).

Source code in blue/metadata.py
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
def build_entity_description_prompt(self, entity_obj, attributes):
    """
    Compose the LLM prompt that asks for entity and attribute descriptions.

    The prompt lists the entity's name, scope and type, followed by one
    bullet per attribute (name, type, and up to three sample values), and
    ends with the JSON shape the model is asked to return.

    Parameters:
        entity_obj (dict): Entity record; the "name", "scope" and "type"
            keys are read, each falling back to "Unknown".
        attributes (list[dict]): Attribute records. For each one, "name",
            properties.info.type (fallback "unknown") and
            properties.stats.sample_values (fallback empty) are read.

    Returns:
        str: The multi-line prompt requesting a JSON object with a
        "table_description" string and an "attributes" mapping.
    """

    def bullet(attribute):
        # One line per attribute; only the first three samples are shown.
        details = attribute.get("properties", {})
        declared_type = details.get("info", {}).get("type", "unknown")
        samples = details.get("stats", {}).get("sample_values", [])
        preview = ', '.join(str(sample) for sample in samples[:3])
        return f"- {attribute.get('name')} ({declared_type}), samples: {preview}"

    entity_name = entity_obj.get("name", "Unknown")
    entity_scope = entity_obj.get("scope", "Unknown")
    entity_type = entity_obj.get("type", "Unknown")

    attribute_block = "\n".join(bullet(attribute) for attribute in attributes)

    # The template text (including its indentation) is sent to the model as is.
    return f"""
    You are given a database entity definition with its attributes and metadata.
    Your task is to generate a structured JSON output with:
    1. A concise human-readable description of what this table/entity represents.
    2. Concise descriptions of each attribute.

    Entity Name: {entity_name}
    Scope: {entity_scope}
    Type: {entity_type}

    Attributes:
    {attribute_block}

    Output JSON format (do not include extra commentary, only valid JSON):

    {{
    "table_description": "string",
    "attributes": {{
        "attr_name": "description of attribute",
        ...
    }}
    }}
    """

collect_source_database_collection_metadata(data_registry, source, database, collection, recursive=False, rebuild=False)

Collect and enrich metadata for a specific collection and its entities within a database.

For each entity in the collection, generates enriched table and attribute descriptions using the LLM-based enrichment process, and stores them in the data registry if missing. Optionally, also generates a collection-level description.

Parameters:

Name Type Description Default
data_registry DataRegistry

Registry instance for accessing and storing metadata.

required
source str

Identifier of the data source.

required
database str

Name of the database containing the collection.

required
collection str

Name of the collection to process.

required
recursive bool

Whether to process nested collections or entities. Defaults to False.

False
rebuild bool

Whether to regenerate existing descriptions. Defaults to False.

False

Returns:

Type Description

None

Source code in blue/metadata.py
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
def collect_source_database_collection_metadata(self, data_registry, source, database, collection, recursive=False, rebuild=False):
    """
    Collect and enrich metadata for a collection and its entities.

    For every entity of the collection an LLM call produces a JSON object
    with a table description and per-attribute descriptions. Depending on
    the enable_* properties these are stored in the registry, but only
    where the current description is missing or blank. Entities whose LLM
    answer cannot be used are logged and skipped. Finally, if enabled and
    still missing, a collection-level description is generated from the
    entity descriptions gathered in this run.

    Parameters:
        data_registry (DataRegistry): Registry used to read entities,
            attributes and descriptions and to store generated ones.
        source (str): Identifier of the data source.
        database (str): Name of the database containing the collection.
        collection (str): Name of the collection to process.
        recursive (bool, optional): Accepted for signature parity with the
            other collectors; not used by this method. Defaults to False.
        rebuild (bool, optional): Forwarded to the registry setters.
            Defaults to False.

    Returns:
        None
    """

    entities = data_registry.get_source_database_collection_entities(source, database, collection)

    entity_descriptions = {}
    for entity in entities:
        entity_name = entity.get("name")

        attributes = data_registry.get_source_database_collection_entity_attributes(source, database, collection, entity_name)

        entity_attribute_description = self.enrich_entity(entity, attributes)

        try:
            parsed = json_utils.safe_json_parse(entity_attribute_description)
        except json.JSONDecodeError:
            # Skip the entity, as the message says, instead of carrying on
            # with an empty result and storing blank descriptions.
            logging.warning("LLM did not return valid JSON. Skipping entity enrichment.")
            continue

        # Anything other than a non-empty JSON object cannot be used below.
        if not parsed or not isinstance(parsed, dict):
            logging.warning(f"Entity {entity} returned invalid or empty JSON.")
            continue

        table_desc = parsed.get("table_description", "")
        attribute_descs = parsed.get("attributes", {})
        if not isinstance(attribute_descs, dict):
            # The model ignored the requested shape; keep the table description only.
            attribute_descs = {}
        entity_descriptions[entity_name] = table_desc

        if self.properties.get('enable_entity_description_generation', True):
            current_description = data_registry.get_source_database_collection_entity_description(source, database, collection, entity_name)

            if not current_description or current_description.strip() == "":
                data_registry.set_source_database_collection_entity_description(source, database, collection, entity_name, table_desc, rebuild=rebuild)

        if self.properties.get('enable_attribute_description_generation', True):
            for attr, desc in attribute_descs.items():
                current_description = data_registry.get_source_database_collection_entity_attribute_description(source, database, collection, entity_name, attr)
                if not current_description or current_description.strip() == "":
                    data_registry.set_source_database_collection_entity_attribute_description(source, database, collection, entity_name, attr, desc, rebuild=rebuild)

    if self.properties.get('enable_collection_description_generation', True):
        current_description = data_registry.get_source_database_collection_description(source, database, collection)
        if not current_description or current_description.strip() == "":

            collection_metadata = data_registry.get_source_database_collection_property(source, database, collection, "metadata")

            if not collection_metadata:
                # Minimal stand-in so the prompt still names the collection.
                collection_metadata = {"name": collection, "type": "collection"}

            # The prompt must name the collection being described, not its
            # parent database.
            collection_desc = self.enrich_collection_description(collection, entity_descriptions, collection_metadata)

            data_registry.set_source_database_collection_description(source, database, collection, collection_desc, rebuild=rebuild)

collect_source_database_metadata(data_registry, source, database, recursive=False, rebuild=False)

Collect and enrich metadata for a database within a data source.

This method checks whether the database already has a description. If not, it uses available metadata and collection descriptions to generate an enriched description (via enrich_database_description) and stores it back into the data registry. Optionally, it can also recurse into collections to collect their metadata.

Parameters:

Name Type Description Default
data_registry DataRegistry

The registry object that manages sources, databases, collections, and metadata.

required
source str

Identifier for the data source.

required
database str

Name of the database to collect metadata for.

required
recursive bool

If True, also collect metadata for all collections within the database. Defaults to False.

False
rebuild bool

If True, forces metadata to be rebuilt or refreshed even if it already exists. Defaults to False.

False

Returns:

Type Description

None

Source code in blue/metadata.py
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
def collect_source_database_metadata(self, data_registry, source, database, recursive=False, rebuild=False):
    """
    Collect and enrich metadata for a database within a data source.

    When the `enable_database_description_generation` property is enabled
    (it defaults to True) and the database's current description is missing,
    empty, or whitespace-only, a description is generated via
    `enrich_database_description` from the database metadata and the
    descriptions currently attached to its collections, then stored back into
    the data registry. Optionally, the collections are visited afterwards to
    collect their metadata.

    Parameters:
        data_registry (DataRegistry): The registry object that manages sources,
            databases, collections, and metadata.
        source (str): Identifier for the data source.
        database (str): Name of the database to collect metadata for.
        recursive (bool, optional): If True, also collect metadata for all
            collections within the database. Defaults to False.
        rebuild (bool, optional): Forwarded to
            `set_source_database_description` and to the collection-level
            calls. A non-empty existing description is left untouched by this
            method regardless of this flag. Defaults to False.

    Returns:
        None

    """
    # A registry lookup that yields nothing is treated as "no collections",
    # so neither loop below fails on a None result.
    collections = data_registry.get_source_database_collections(source, database) or []

    if self.properties.get('enable_database_description_generation', True):
        current_description = data_registry.get_source_database_description(source, database)
        if not current_description or not current_description.strip():

            database_metadata = data_registry.get_source_database_property(source, database, "metadata")

            if not database_metadata:
                # No stored metadata: fall back to a minimal stub so the
                # prompt still identifies the database.
                database_metadata = {"name": database, "type": "database"}

            # Map each collection name to whatever description it has right now.
            collection_descriptions = {
                collection.get("name"): collection.get("description")
                for collection in collections
            }

            database_desc = self.enrich_database_description(database, collection_descriptions, database_metadata)

            data_registry.set_source_database_description(source, database, database_desc, rebuild=rebuild)

    if recursive:
        # NOTE(review): this pass runs after the database description was
        # built, so that description only reflects collection descriptions
        # that already existed in the listing - confirm the ordering is intended.
        # NOTE(review): each listing item is forwarded unchanged, while the
        # branch above reads the same items as records with a "name" key -
        # confirm the collection-level collector expects records, not names.
        for collection in collections:
            self.collect_source_database_collection_metadata(data_registry, source, database, collection, recursive=recursive, rebuild=rebuild)

collect_source_metadata(data_registry, source, recursive=False, rebuild=False)

Collect and optionally recursively enrich metadata for a data source.

If recursive is True, iterates through all databases under the source and collects/enriches their metadata.

Parameters:

Name Type Description Default
data_registry DataRegistry

Registry instance for metadata access/storage.

required
source str

Identifier of the data source.

required
recursive bool

Whether to include child databases. Defaults to False.

False
rebuild bool

Forwarded unchanged to each database-level collection call made when recursive is True. Defaults to False.

False

Returns:

Type Description

None

Source code in blue/metadata.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
def collect_source_metadata(self, data_registry, source, recursive=False, rebuild=False):
    """
    Collect and optionally recursively enrich metadata for a data source.

    When `recursive` is False this method does nothing. When it is True, the
    databases registered under the source are listed and
    `collect_source_database_metadata` is invoked for each of them.

    Parameters:
        data_registry (DataRegistry): Registry instance for metadata access/storage.
        source (str): Identifier of the data source.
        recursive (bool, optional): Whether to include child databases. Defaults to False.
        rebuild (bool, optional): Forwarded unchanged to each database-level
            call. Defaults to False.

    Returns:
        None
    """
    if not recursive:
        # Nothing is collected at the source level itself.
        return

    # A registry lookup that yields nothing is treated as "no databases"
    # instead of failing when iterated.
    databases = data_registry.get_source_databases(source) or []
    for database in databases:
        self.collect_source_database_metadata(data_registry, source, database, recursive=recursive, rebuild=rebuild)

enrich_collection_description(collection_name, entity_descriptions, collection_metadata)

Enrich a collection description using LLM.

Builds a prompt from the provided entity descriptions and metadata, then executes an LLM call to generate or refine the collection-level description.

Parameters:

Name Type Description Default
collection_name str

The name of the collection.

required
entity_descriptions dict

Mapping of entity names to their descriptions.

required
collection_metadata dict or str

Additional metadata for the collection.

required

Returns:

Name Type Description
Any

The enriched collection description, as returned by the LLM.

Source code in blue/metadata.py
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
def enrich_collection_description(self, collection_name, entity_descriptions, collection_metadata):
    """
    Produce a collection-level description through the configured API call.

    The prompt is assembled by `build_collection_description_prompt` from the
    collection name, its entity descriptions, and its metadata, and is then
    handed to `execute_api_call` together with this client's properties.

    Parameters:
        collection_name (str): The name of the collection.
        entity_descriptions (dict): Mapping of entity names to their descriptions.
        collection_metadata (dict or str): Additional metadata for the collection.

    Returns:
        Any: Whatever `execute_api_call` returns for the generated prompt.
    """
    return self.execute_api_call(
        self.build_collection_description_prompt(
            collection_name,
            entity_descriptions,
            collection_metadata,
        ),
        properties=self.properties,
        additional_data={},
    )

enrich_database_description(database_name, collection_descriptions, database_metadata)

Enrich a database description using LLM.

Builds a prompt from the provided collection descriptions and metadata, then executes an API call to generate or refine the database-level description.

Parameters:

Name Type Description Default
database_name str

The name of the database.

required
collection_descriptions dict

Mapping of collection names to their descriptions.

required
database_metadata dict or str

Additional metadata for the database.

required

Returns:

Name Type Description
Any

The enriched database description.

Source code in blue/metadata.py
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
def enrich_database_description(self, database_name, collection_descriptions, database_metadata):
    """
    Produce a database-level description through the configured API call.

    The prompt is assembled by `build_database_description_prompt` from the
    database name, its collection descriptions, and its metadata, and is then
    handed to `execute_api_call` together with this client's properties.

    Parameters:
        database_name (str): The name of the database.
        collection_descriptions (dict): Mapping of collection names to their descriptions.
        database_metadata (dict or str): Additional metadata for the database.

    Returns:
        Any: Whatever `execute_api_call` returns for the generated prompt.
    """
    return self.execute_api_call(
        self.build_database_description_prompt(
            database_name,
            collection_descriptions,
            database_metadata,
        ),
        properties=self.properties,
        additional_data={},
    )

enrich_entity(entity, attributes)

Generate an enriched description for an entity using its attributes.

Builds a prompt from the entity and its attributes, then calls the external LLM API to produce the enriched description.

Parameters:

Name Type Description Default
entity dict

The entity metadata to enrich.

required
attributes dict

Attribute data associated with the entity.

required

Returns:

Name Type Description
str

Enriched description text generated by the API.

Source code in blue/metadata.py
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def enrich_entity(self, entity, attributes):
    """
    Produce an enriched description for an entity from its attributes.

    The prompt is assembled by `build_entity_description_prompt` from the
    entity and its attributes, and is then handed to `execute_api_call`
    together with this client's properties.

    Parameters:
        entity (dict): The entity metadata to enrich.
        attributes (dict): Attribute data associated with the entity.

    Returns:
        str: Enriched description text, as returned by `execute_api_call`.
    """
    return self.execute_api_call(
        self.build_entity_description_prompt(entity, attributes),
        properties=self.properties,
        additional_data={},
    )
Last update: 2025-10-09