Skip to content

Semantic project operator

SemanticProjectOperator

Bases: Operator, ServiceClient

Semantic Project Operator projects records to select and rename columns using LLM-based mapping resolution. Uses natural language instructions to determine which columns to keep and how to rename them.

Attributes:

Name Type Required Default Description
projection_instructions str Yes None Natural language description of which columns to keep and how to rename them
Source code in blue/operators/semantic_project_operator.py
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
class SemanticProjectOperator(Operator, ServiceClient):
    """
    Semantic Project Operator projects records to select and rename columns using LLM-based mapping resolution.
    Uses natural language instructions to determine which columns to keep and how to rename them.

    The operator itself is a thin registration shell: it wires the module-level function,
    validator and explainer into the `Operator` base class and supplies the prompt template
    and default service properties used to resolve the column mapping.

    Attributes:
    ----------
    | Name                   | Type | Required | Default | Description                                                                 |
    |------------------------|------|----------|---------|-----------------------------------------------------------------------------|
    | `projection_instructions` | str  | :fontawesome-solid-circle-check: {.green-check}     | None    | Natural language description of which columns to keep and how to rename them |

    """

    # Prompt template; `${schema}` and `${projection_instructions}` are placeholders
    # to be substituted before the prompt is sent.
    MAPPING_PROMPT = """## Task
You are given a database schema with data types and natural language projection instructions. Your job is to generate a JSON mapping that specifies which columns to keep and how to rename them.

## Available Schema (with data types)
${schema}

## Projection Instructions
${projection_instructions}

## Output Requirements
- Return a **single JSON object** as the output
- The JSON object must contain key-value pairs where:
  - **Keys**: original column names from the schema (exact matches)
  - **Values**: new column names for the output
- **Column Selection Rules**:
  - Only include columns that should be kept in the output
  - If a column should be dropped, do not include it in the mapping
  - If a column should be renamed, map the old name to the new name
  - If a column should keep its original name, map it to itself
- **Validation Rules**:
  - All keys must exist in the provided schema
  - All values must be unique (no duplicate output column names)
  - All values must be valid column names (no special characters, extra spaces, etc.)
- **Examples**:
  - Keep and rename: `{"fname": "full_name", "lname": "last_name"}`
  - Keep without rename: `{"skills": "skills", "experience": "experience"}`
  - Drop column: simply don't include it in the mapping

## Additional Notes
- Interpret the natural language requirements carefully
- Be precise about which columns to keep vs drop
- Follow standard naming conventions for column names
- Return only the JSON object, no explanations or additional text

---

### Output JSON Object
"""

    PROPERTIES = {
        # openai related properties
        "openai.api": "ChatCompletion",
        "openai.model": "gpt-4o",
        "openai.stream": False,
        "openai.max_tokens": 1024,
        "openai.temperature": 0,
        # io related properties
        "input_json": "[{\"role\": \"user\"}]",
        "input_context": "$[0]",
        "input_context_field": "content",
        "input_field": "messages",
        "input_template": MAPPING_PROMPT,
        "output_path": "$.choices[0].message.content",
        # service related properties
        "service_prefix": "openai",
        # output transformations
        # NOTE(review): the second rule removes the text "json", presumably to drop the
        # language tag of a ```json fence. If it is applied as a plain substring replace it
        # would also alter column names that contain "json" (e.g. "json_data") - confirm.
        "output_transformations": [{"transformation": "replace", "from": "```", "to": ""}, {"transformation": "replace", "from": "json", "to": ""}],
        "output_strip": True,
        "output_cast": "json",
    }

    name = "semantic_project"
    description = "Projects records to select and rename columns using LLM-based mapping resolution"
    default_attributes = {
        "projection_instructions": {"type": "str", "description": "Natural language description of which columns to keep and how to rename them", "required": True},
    }

    def __init__(self, description: str = None, properties: Dict[str, Any] = None):
        """Register the operator with its function, validator and explainer.

        Parameters:
            description: Optional override for the class-level `description`; the class
                default is used when this is None or empty.
            properties: Optional properties dictionary forwarded unchanged to the base class.
        """
        super().__init__(
            self.name,
            function=semantic_project_operator_function,
            description=description or self.description,
            properties=properties,
            validator=semantic_project_operator_validator,
            explainer=semantic_project_operator_explainer,
        )

    def _initialize_properties(self):
        """Run the base-class initialization, then set this operator's attribute spec and service URL."""
        import copy  # local import so this block does not depend on a file-level import

        super()._initialize_properties()
        # Hand each instance its own copy: storing the class-level dict directly would let a
        # mutation through one instance's properties change the defaults of every other instance.
        self.properties["attributes"] = copy.deepcopy(self.default_attributes)

        # service_url, set as default
        # A bare `PROPERTIES` inside a method resolves to the module-level name, not to the
        # class attribute of the same name defined above.
        self.properties["service_url"] = PROPERTIES["services.openai.service_url"]

semantic_project_operator_explainer(output, input_data, attributes)

Generate explanation for semantic project operator execution.

Parameters:

Name Type Description Default
output Any

The output result from the operator execution.

required
input_data List[List[Dict[str, Any]]]

The input data that was processed.

required
attributes Dict[str, Any]

The attributes used for the operation.

required

Returns:

Type Description
Dict[str, Any]

Dictionary containing explanation of the operation.

Source code in blue/operators/semantic_project_operator.py
78
79
80
81
82
83
84
85
86
87
88
89
def semantic_project_operator_explainer(output: Any, input_data: List[List[Dict[str, Any]]], attributes: Dict[str, Any]) -> Dict[str, Any]:
    """Build the explanation for one semantic project operator run.

    This operator adds nothing of its own to the explanation; the work is
    delegated to `default_operator_explainer` with the arguments unchanged.

    Parameters:
        output: Result produced by the operator execution.
        input_data: Input data the operator processed.
        attributes: Attributes the operator was invoked with.

    Returns:
        The explanation dictionary produced by the default explainer.
    """
    explanation = default_operator_explainer(output, input_data, attributes)
    return explanation

semantic_project_operator_function(input_data, attributes, properties=None)

Project records to select and rename columns using LLM-based mapping resolution.

Parameters:

Name Type Description Default
input_data List[List[Dict[str, Any]]]

List of JSON arrays (List[List[Dict[str, Any]]]) containing records to project.

required
attributes Dict[str, Any]

Dictionary containing projection parameters including projection_instructions.

required
properties Dict[str, Any]

Optional properties dictionary containing service configuration. Defaults to None.

None

Returns:

Type Description
List[List[Dict[str, Any]]]

List containing projected records with selected and renamed columns.

Source code in blue/operators/semantic_project_operator.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def semantic_project_operator_function(input_data: List[List[Dict[str, Any]]], attributes: Dict[str, Any], properties: Dict[str, Any] = None) -> List[List[Dict[str, Any]]]:
    """Project records to select and rename columns using LLM-based mapping resolution.

    Each group in `input_data` is handled independently: its schema is extracted, a
    column mapping is resolved from that schema and the instructions, and the mapping
    is applied to the group's records.

    Parameters:
        input_data: List of JSON arrays (List[List[Dict[str, Any]]]) containing records to project.
        attributes: Dictionary containing projection parameters including projection_instructions.
        properties: Optional properties dictionary containing service configuration. Defaults to None.

    Returns:
        An empty list when there is no data at all (no groups, or only empty groups) or
        when no projection instructions are given. Otherwise one output list per input
        group, in order; a group that is empty or for which no mapping could be resolved
        yields an empty list in its position.
    """
    projection_instructions = attributes.get('projection_instructions', '')

    # Bail out only when every group is empty. Testing just the first group would
    # discard the data of all later groups whenever the first one happens to be empty,
    # contradicting the per-group handling in the loop below.
    if not input_data or not any(input_data):
        return []

    if not projection_instructions:
        return []

    service_client = ServiceClient(name="semantic_project_operator_service_client", properties=properties)

    results = []
    for data_group in input_data:
        # Keep positional alignment with the input: an empty group maps to an empty result.
        if not data_group:
            results.append([])
            continue

        # Generate column mapping using LLM
        schema = _extract_typed_schema(data_group)
        resolved_mapping = _resolve_column_mapping(schema, projection_instructions, service_client, properties)
        if not resolved_mapping:
            # No usable mapping for this group: emit an empty result rather than unprojected records.
            results.append([])
            continue

        # Apply projection with resolved mapping
        results.append(_apply_projection(data_group, resolved_mapping))

    return results

semantic_project_operator_validator(input_data, attributes, properties=None)

Validate semantic project operator attributes.

Parameters:

Name Type Description Default
input_data List[List[Dict[str, Any]]]

List of JSON arrays (List[List[Dict[str, Any]]]) to validate.

required
attributes Dict[str, Any]

Dictionary containing operator attributes to validate.

required
properties Dict[str, Any]

Optional properties dictionary. Defaults to None.

None

Returns:

Type Description
bool

True if attributes are valid, False otherwise.

Source code in blue/operators/semantic_project_operator.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def semantic_project_operator_validator(input_data: List[List[Dict[str, Any]]], attributes: Dict[str, Any], properties: Dict[str, Any] = None) -> bool:
    """Check that the semantic project operator can run with the given attributes.

    Validation has two stages: the generic `default_operator_validator` check, then an
    operator-specific check that `projection_instructions` is a non-blank string.

    Parameters:
        input_data: List of JSON arrays (List[List[Dict[str, Any]]]) to validate.
        attributes: Dictionary containing operator attributes to validate.
        properties: Optional properties dictionary. Defaults to None.

    Returns:
        True if attributes are valid, False otherwise.
    """
    # Stage 1: generic validation. Any exception raised here counts as "invalid".
    try:
        passed_default_checks = bool(default_operator_validator(input_data, attributes, properties))
    except Exception:
        return False
    if not passed_default_checks:
        return False

    # Stage 2: the instructions must be a string with non-whitespace content.
    instructions = attributes.get('projection_instructions', '')
    return isinstance(instructions, str) and bool(instructions.strip())
Last update: 2025-10-08