18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
class NL2LLMAgent(Agent):
    """An agent that processes natural language queries using LLM models.

    It can be configured to use a specific LLM source or discover available
    LLM sources in the data registry. The agent sends the query to the
    selected LLM source and returns the response as structured data.

    Properties (in addition to Agent properties):
    ----------
    | Name | Type | Default | Description |
    |------|------|---------|-------------|
    | `nl2llm_source` | `str` | `None` | The name of the LLM source to use. If None, the agent will discover available LLM sources in the data registry. |
    | `nl2llm_discovery` | `bool` | `True` | If True (and no source is set), the agent searches the data registry for sources whose protocol is listed in `nl2llm_discovery_source_protocols`. If False, the default "openai" source is used. |
    | `nl2llm_discovery_source_protocols` | `list of str` | `["openai"]` | List of protocols to search for. Should be changed to `["llm"]` after github issue #945 is resolved. |
    | `nl2llm_context` | `list of str` | `[]` | Optional context to provide to the LLM source for query processing. |
    | `nl2llm_attr_names` | `list of str` | `[]` | Optional attribute names to provide to the LLM source for query processing. |
    | `nl2llm_output_filters` | `list of str` | `["all"]` | Output filters to apply to the result. Options are "all", "question", "source", "result", "error". |
    | `nl2llm_output_max_results` | `int or None` | `None` | If not None, limits the number of records in the returned JSON array. |

    Inputs:
    - DEFAULT: natural language query

    Outputs:
    - DEFAULT: query results, tagged as `QUERY` and `NL`
    """

    PROPERTIES = {
        # agent related properties
        "nl2llm_source": None,
        "nl2llm_discovery": True,  # if True, search the data registry for sources whose protocol is listed below; if False, just use the "openai" source
        "nl2llm_discovery_source_protocols": ["openai"],  # list of protocols to search for, should be changed to ["llm"] after github issue #945 is resolved
        "nl2llm_context": [],
        "nl2llm_attr_names": [],
        # output related properties
        "nl2llm_output_filters": ["all"],
        "nl2llm_output_max_results": None,  # if not None, it will limit the number of records in returned json array
    }

    # source (and its protocol) used when discovery is disabled or finds nothing
    _DEFAULT_SOURCE = "openai"
    _DEFAULT_SOURCE_PROTOCOL = "openai"

    def __init__(self, **kwargs):
        """Create the agent, naming it "NL2LLM" unless a name is supplied."""
        kwargs.setdefault('name', "NL2LLM")
        super().__init__(**kwargs)

    def _initialize_properties(self):
        """Install the NL2LLM default properties on top of the base agent's."""
        import copy

        super()._initialize_properties()

        # Deep-copy each default: several defaults are lists, and sharing the
        # class-level objects would let one instance's in-place mutation leak
        # into PROPERTIES and therefore into every other instance.
        for key, value in NL2LLMAgent.PROPERTIES.items():
            self.properties[key] = copy.deepcopy(value)

    ####### inputs / outputs
    def _initialize_inputs(self):
        """Initialize input parameters for the NL2LLM agent."""
        self.add_input("DEFAULT", description="natural language query")

    def _initialize_outputs(self):
        """Initialize outputs for the NL2LLM agent, tags the output as QUERY and NL."""
        self.add_output("DEFAULT", description="query results", tags=["QUERY", "NL"])

    def _start(self):
        """Start the NL2LLM agent: base start, then registry, then source selection."""
        self.logger.info("NL2LLMAgent _start() called")
        super()._start()

        # initialize registry (must precede source selection, which queries it)
        self.logger.info("Initializing registry...")
        self._init_registry()

        # initialize source
        self.logger.info("Initializing source...")
        self._init_source()

        self.logger.info("NL2LLMAgent initialization complete")

    def _init_registry(self):
        """Create the data registry client and store it on ``self.registry``."""
        platform_id = self.properties["platform.name"]
        prefix = 'PLATFORM:' + platform_id
        self.logger.info(f"Creating DataRegistry with id={self.properties['data_registry.name']}, prefix={prefix}")
        self.registry = DataRegistry(id=self.properties['data_registry.name'], prefix=prefix, properties=self.properties)
        self.logger.info("DataRegistry created successfully")

    def _init_source(self):
        """Select the LLM source the agent will query.

        Resolution order:
        1. the source named by ``nl2llm_source``, if set;
        2. the default source, if ``nl2llm_discovery`` is False;
        3. the first source returned by ``_search_sources``;
        4. the default source, if discovery finds nothing.

        Sets ``selected_source``, ``selected_source_protocol`` and
        ``selected_source_protocol_variant``.
        """
        # reset selection state
        self.selected_source = None
        self.selected_source_protocol = None
        self.selected_source_protocol_variant = None

        # explicitly configured source wins
        if "nl2llm_source" in self.properties and self.properties["nl2llm_source"]:
            self.selected_source = self.properties["nl2llm_source"]
            self._load_source_protocol(self.selected_source)
            self._log_selected_source()
            return

        # discovery switched off: use the default source
        if "nl2llm_discovery" in self.properties and not self.properties["nl2llm_discovery"]:
            self._use_default_source()
            return

        ## discover llm sources
        sources = self._search_sources(scope=None)
        self.logger.info(f"Found sources: {sources}")

        if not sources:
            self.logger.error("No sources found during discovery. Please check data registry configuration.")
            # Set a default source for now
            self._use_default_source()
            return

        # use only the first available source
        self.selected_source = sources[0]
        if self._load_source_protocol(self.selected_source):
            self._log_selected_source()
        else:
            self.logger.error(f"Source {self.selected_source} has no connection properties")

    def _load_source_protocol(self, source_name):
        """Read protocol and protocol variant for ``source_name`` from the registry.

        Parameters:
            source_name: Name of the source to look up.

        Returns:
            True if the source has a ``connection`` entry, False otherwise.
        """
        source_properties = self.registry.get_source_properties(source_name)
        if not source_properties:
            return False

        connection = source_properties.get('connection') or {}

        protocol = connection.get('protocol')
        if protocol:
            self.selected_source_protocol = protocol

        # NOTE(review): the variant has been read from both the connection
        # properties and the top-level source properties in the past; accept
        # either location, preferring the connection. Confirm the schema.
        variant = connection.get('protocol_variant')
        if variant is None:
            variant = source_properties.get('protocol_variant')
        self.selected_source_protocol_variant = variant

        return 'connection' in source_properties

    def _use_default_source(self):
        """Fall back to the default source and protocol."""
        self.selected_source = self._DEFAULT_SOURCE
        self.selected_source_protocol = self._DEFAULT_SOURCE_PROTOCOL
        self.logger.info(f"Using default source: {self.selected_source}")

    def _log_selected_source(self):
        """Log the currently selected source, protocol and protocol variant."""
        self.logger.info(f"selected source: {self.selected_source}")
        self.logger.info(f"selected source protocol: {self.selected_source_protocol}")
        self.logger.info(f"selected source protocol variant: {self.selected_source_protocol_variant}")

    def _search_sources(self, scope=None):
        """Search the data registry for candidate LLM sources.

        Parameters:
            scope: The scope to search data registry within. If None, searches
                from the root scope ``'/'``.

        Returns:
            A list of source names. When ``nl2llm_discovery_source_protocols``
            is set, only sources whose connection protocol is in that list are
            returned.
        """
        records = self.registry.list_records(type='source', scope=scope if scope else '/', recursive=False) or []

        # always work with names so the return type does not depend on filtering
        names = [name for name in (record.get('name') for record in records) if name]

        protocols = None
        if 'nl2llm_discovery_source_protocols' in self.properties:
            protocols = self.properties['nl2llm_discovery_source_protocols']
        if protocols is None:
            return names

        matching = []
        for name in names:
            source_properties = self.registry.get_source_properties(name)
            if not source_properties or 'connection' not in source_properties:
                continue
            connection = source_properties['connection'] or {}
            if connection.get('protocol') in protocols:
                matching.append(name)
        return matching

    def get_properties(self, properties=None):
        """Get properties for the NL2LLM agent.

        Copied from RequestorAgent.get_properties().

        Parameters:
            properties: Optional properties dictionary to override agent properties. Defaults to None.

        Returns:
            A new dictionary of agent properties with the overrides applied.
        """
        merged_properties = dict(self.properties)
        if properties is not None:
            merged_properties.update(properties)
        return merged_properties

    def default_processor(self, message, input="DEFAULT", properties=None, worker=None):
        """Process incoming messages and execute LLM queries.

        Data messages are accumulated per stream; the query is executed once
        the end-of-stream message arrives.

        Parameters:
            message: The message to process.
            input: The input stream label.
            properties: Additional properties for processing.
            worker: The worker handling the processing.

        Returns:
            None. Results are written to the worker's output stream.
        """
        if message.isEOS():
            # get all data received from stream
            stream_data = worker.get_data('stream') if worker else None

            # get properties, overriding with properties provided
            properties = self.get_properties(properties=properties)

            if stream_data:
                if isinstance(stream_data, str):
                    stream_data = [stream_data]
                # Use everything that was accumulated, not just the first
                # chunk, so input that arrives in several data messages is
                # not silently truncated.
                input_data = " ".join(str(chunk) for chunk in stream_data)

                # Only process if we have valid input data (not empty)
                if input_data.strip() != "":
                    self.logger.info(f"Processing accumulated input: {input_data}")
                    result = self.process_query(input_data, properties=properties)
                    # write result to output stream
                    if worker:
                        worker.write_data(result)
                else:
                    self.logger.info(f"Skipping processing for empty/null input: {input_data}")
            else:
                self.logger.info("No data accumulated in stream, skipping processing")

            if worker:
                worker.write_eos()
        elif message.isBOS():
            # init stream to empty array
            if worker:
                worker.set_data('stream', [])
        elif message.isData():
            # store data value
            data = message.getData()
            self.logger.info(f"Accumulating data: {data}")
            if worker:
                worker.append_data('stream', str(data))
        return None

    def process_query(self, question, properties=None):
        """Process a natural language query using the selected LLM source.

        Parameters:
            question: The natural language question to process.
            properties: Additional properties for processing.

        Returns:
            None for an empty/null question; otherwise the output of
            ``_apply_filter`` over the question, source, result, and any error
            encountered. Errors raised while querying are captured in the
            ``error`` field rather than propagated.
        """
        properties = self.get_properties(properties=properties)

        # Validate question - return None for empty/null questions
        if question is None or question.strip() == "":
            self.logger.info(f"Skipping processing for empty/null question: {question}")
            return None

        # initialize source if not already initialized (needs a registry)
        if getattr(self, "selected_source", None) is None and getattr(self, "registry", None) is not None:
            try:
                self._init_source()
            except Exception as e:
                self.logger.error(f"Error initializing source: {e}")

        # check if source is selected
        source = getattr(self, "selected_source", None)
        if source is None:
            error = "No source selected. Please check data registry configuration."
            self.logger.error(error)
            return self._apply_filter({'question': question, 'source': None, 'result': None, 'error': error}, properties=properties)

        try:
            # connect to the source
            source_connection = self.registry.connect_source(source)

            # execute query
            self.logger.info(f"source: {source}")
            self.logger.info(f"executing query: {question}")
            result = source_connection.execute_query(question, optional_properties={'context': properties.get('nl2llm_context', ""), 'attr_names': properties.get('nl2llm_attr_names', [])})
            self.logger.info("result: " + str(result))
        except Exception as e:
            # boundary handler: report any failure to the caller as data
            error = str(e)
            self.logger.error(f"Error executing query: {error}")
            return self._apply_filter({'question': question, 'source': source, 'result': None, 'error': error}, properties=properties)

        # apply output filters
        return self._apply_filter({'question': question, 'source': source, 'result': result, 'error': None}, properties=properties)

    def _apply_filter(self, output, properties=None):
        """Apply output filters to the result.

        Parameters:
            output: The output dictionary containing question, source, result, and error.
            properties: Properties to read ``nl2llm_output_filters`` and
                ``nl2llm_output_max_results`` from. Falls back to the agent
                properties when None.

        Returns:
            - the full dictionary when the filters contain "all";
            - the bare field value when exactly one filter is given
              (None if it names no known field);
            - otherwise a dictionary holding only the requested fields.
        """
        # honour per-call overrides instead of always reading agent properties
        props = self.properties if properties is None else properties

        output_filters = ['all']
        if 'nl2llm_output_filters' in props:
            output_filters = props['nl2llm_output_filters']
        if output_filters is None:
            output_filters = ['all']

        fields = {
            'question': output['question'],
            'source': output['source'],
            'result': output['result'],
            'error': output['error'],
        }

        # max results
        if "nl2llm_output_max_results" in props and props['nl2llm_output_max_results']:
            output_max_results = int(props['nl2llm_output_max_results'])
            if isinstance(fields['result'], list):
                fields['result'] = fields['result'][:output_max_results]

        if 'all' in output_filters:
            return dict(fields)

        if len(output_filters) == 1:
            # single filter: return the bare value rather than a dictionary
            message = None
            for key in ('question', 'source', 'error', 'result'):
                if key in output_filters:
                    message = fields[key]
            return message

        return {key: fields[key] for key in ('question', 'source', 'result', 'error') if key in output_filters}
|