Lucene regex filter
This presentation contains an example of a filter with a Lucene-conformant regular expression. A concatenator, which merges different fields from an event, is used as the processor to demonstrate the filter function.
Until now, it was necessary to list a key in regex_fields whenever its filter value contained a regular expression.
Set up the document and define the concatenator processor to test the filter
[1]:
import sys
sys.path.insert(0,"../../../../../")
import tempfile
from copy import deepcopy
from pathlib import Path
from unittest import mock
from logprep.factory import Factory
# Sample event fed to the processor; note that the type value itself is
# wrapped in slashes.
document = {
    "data_stream": {
        "dataset": "windows",
        "namespace": "devopslab",
        "type": "/logs/",
    },
    "_op_type": "create",
}

# Reference event: same as the sample, but with a slash-free type and the
# concatenated _index field appended.
expected = deepcopy(document)
expected["data_stream"]["type"] = "logs"
expected["_index"] = "logs-windows-devopslab"

# The rule file lives in a "concatenator" directory below the system temp dir.
rule_path = Path(tempfile.gettempdir()) / "concatenator"
rule_path.mkdir(exist_ok=True)
rule_file = rule_path / "data-stream.yml"

# Drop a rule file that may be left over from an earlier run.
if rule_file.exists():
    rule_file.unlink()

# Processor configuration handed to the factory; "rules" lists the rule
# directory created above plus "/dev".
processor_config = {
    "myconcatenator": {
        "type": "concatenator",
        "rules": [str(rule_path), "/dev"],
    },
}
concatenator = Factory.create(processor_config)
def concat_with_rule(rule_yaml):
    """Process a copy of the sample document with the given rule.

    The text in ``rule_yaml`` replaces the content of ``rule_file``, a new
    processor is created from ``processor_config`` (whose ``rules`` entry
    includes the directory holding that file), and a deep copy of
    ``document`` is printed before and after it has been processed.

    Args:
        rule_yaml: Rule definition as YAML text.
    """
    # Always start from a clean rule file.
    if rule_file.exists():
        rule_file.unlink()
    rule_file.write_text(rule_yaml)

    processor = Factory.create(processor_config)

    # Work on a copy so that the module-level sample stays untouched.
    event = deepcopy(document)
    print(f"before: {event}")
    processor.process(event)
    print(f"after: {event}")
Former version with explicit regex_fields annotation
[2]:
# Legacy style: the filter value is a plain quoted string and the key is
# additionally listed under regex_fields so that it is treated as a regex.
# The concatenator options are nested under the "concatenator" key.
rule_yaml = """---
filter: 'data_stream.type: ".*lo.*"'
regex_fields:
  - "data_stream.type"
concatenator:
  source_fields:
    - data_stream.type
    - data_stream.dataset
    - data_stream.namespace
  target_field: _index
  separator: "-"
  overwrite_target: false
  delete_source_fields: false
"""
concat_with_rule(rule_yaml)
[Deprecated]: regex_fields are no longer necessary. Use Lucene regex annotation.
before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create'}
after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create', '_index': '/logs/-windows-devopslab'}
New Lucene-conformant version without the need for regex_fields
[3]:
# Lucene regex notation: the pattern is delimited by slashes directly in the
# filter expression, so no regex_fields entry is given.
# The concatenator options are nested under the "concatenator" key.
rule_yaml = """---
filter: 'data_stream.type: /.*log.*/'
concatenator:
  source_fields:
    - data_stream.type
    - data_stream.dataset
    - data_stream.namespace
  target_field: _index
  separator: "-"
  overwrite_target: false
  delete_source_fields: false
"""
concat_with_rule(rule_yaml)
before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create'}
after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create', '_index': '/logs/-windows-devopslab'}
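Escaping a slash: a literal slash inside the pattern must be escaped with a backslash for the Lucene regex syntax. The backslash is doubled here because the rule is embedded in a regular (non-raw) Python string; a double-quoted YAML string would require the same doubling.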
[4]:
# The pattern starts with a literal slash, which has to be escaped inside the
# slash-delimited regex. "\\/" in this non-raw Python string ends up as "\/"
# in the rule file.
# The concatenator options are nested under the "concatenator" key.
rule_yaml = """---
filter: 'data_stream.type: /\\/lo.*/'
concatenator:
  source_fields:
    - data_stream.type
    - data_stream.dataset
    - data_stream.namespace
  target_field: _index
  separator: "-"
  overwrite_target: false
  delete_source_fields: false
"""
concat_with_rule(rule_yaml)
before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create'}
after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create', '_index': '/logs/-windows-devopslab'}