-
Notifications
You must be signed in to change notification settings - Fork 92
feat: Filtering by the presence of a value in an array #873
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,174 @@ | ||||||
| package logstorage | ||||||
|
|
||||||
| import ( | ||||||
| "fmt" | ||||||
| "strings" | ||||||
|
|
||||||
| "github.com/VictoriaMetrics/VictoriaLogs/lib/prefixfilter" | ||||||
| "github.com/valyala/fastjson" | ||||||
| ) | ||||||
|
|
||||||
| // filterArrayContains matches if the JSON array in the given field contains the given value. | ||||||
| // | ||||||
| // Example LogsQL: `tags:array_contains("prod")` | ||||||
| type filterArrayContains struct { | ||||||
| fieldName string | ||||||
| value string | ||||||
| } | ||||||
|
|
||||||
| func (fa *filterArrayContains) String() string { | ||||||
| return fmt.Sprintf("%sarray_contains(%s)", quoteFieldNameIfNeeded(fa.fieldName), quoteTokenIfNeeded(fa.value)) | ||||||
| } | ||||||
|
|
||||||
| func (fa *filterArrayContains) updateNeededFields(pf *prefixfilter.Filter) { | ||||||
| pf.AddAllowFilter(fa.fieldName) | ||||||
| } | ||||||
|
|
||||||
| func (fa *filterArrayContains) matchRow(fields []Field) bool { | ||||||
| v := getFieldValueByName(fields, fa.fieldName) | ||||||
| return matchArrayContains(v, fa.value) | ||||||
| } | ||||||
|
|
||||||
| func (fa *filterArrayContains) applyToBlockResult(br *blockResult, bm *bitmap) { | ||||||
| c := br.getColumnByName(fa.fieldName) | ||||||
| if c.isConst { | ||||||
| v := c.valuesEncoded[0] | ||||||
| if !matchArrayContains(v, fa.value) { | ||||||
| bm.resetBits() | ||||||
| } | ||||||
| return | ||||||
| } | ||||||
| if c.isTime { | ||||||
| bm.resetBits() | ||||||
| return | ||||||
| } | ||||||
|
|
||||||
| switch c.valueType { | ||||||
| case valueTypeString: | ||||||
| values := c.getValues(br) | ||||||
| bm.forEachSetBit(func(idx int) bool { | ||||||
| v := values[idx] | ||||||
| return matchArrayContains(v, fa.value) | ||||||
| }) | ||||||
| case valueTypeDict: | ||||||
| bb := bbPool.Get() | ||||||
| for _, v := range c.dictValues { | ||||||
| c := byte(0) | ||||||
| if matchArrayContains(v, fa.value) { | ||||||
| c = 1 | ||||||
| } | ||||||
| bb.B = append(bb.B, c) | ||||||
| } | ||||||
| valuesEncoded := c.getValuesEncoded(br) | ||||||
| bm.forEachSetBit(func(idx int) bool { | ||||||
| n := valuesEncoded[idx][0] | ||||||
| return bb.B[n] == 1 | ||||||
| }) | ||||||
| bbPool.Put(bb) | ||||||
| default: | ||||||
| bm.resetBits() | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| func (fa *filterArrayContains) applyToBlockSearch(bs *blockSearch, bm *bitmap) { | ||||||
| fieldName := fa.fieldName | ||||||
| value := fa.value | ||||||
|
|
||||||
| v := bs.getConstColumnValue(fieldName) | ||||||
| if v != "" { | ||||||
| if !matchArrayContains(v, value) { | ||||||
| bm.resetBits() | ||||||
| } | ||||||
| return | ||||||
| } | ||||||
|
|
||||||
| // Verify whether filter matches other columns | ||||||
| ch := bs.getColumnHeader(fieldName) | ||||||
| if ch == nil { | ||||||
| // Fast path - there are no matching columns. | ||||||
| bm.resetBits() | ||||||
| return | ||||||
| } | ||||||
|
|
||||||
| switch ch.valueType { | ||||||
| case valueTypeString: | ||||||
| matchStringByArrayContains(bs, ch, bm, value) | ||||||
| case valueTypeDict: | ||||||
| matchValuesDictByArrayContains(bs, ch, bm, value) | ||||||
| default: | ||||||
| bm.resetBits() | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| func matchValuesDictByArrayContains(bs *blockSearch, ch *columnHeader, bm *bitmap, value string) { | ||||||
| bb := bbPool.Get() | ||||||
| for _, v := range ch.valuesDict.values { | ||||||
| c := byte(0) | ||||||
| if matchArrayContains(v, value) { | ||||||
| c = 1 | ||||||
| } | ||||||
| bb.B = append(bb.B, c) | ||||||
| } | ||||||
| matchEncodedValuesDict(bs, ch, bm, bb.B) | ||||||
| bbPool.Put(bb) | ||||||
| } | ||||||
|
|
||||||
| func matchStringByArrayContains(bs *blockSearch, ch *columnHeader, bm *bitmap, value string) { | ||||||
| visitValues(bs, ch, bm, func(v string) bool { | ||||||
| return matchArrayContains(v, value) | ||||||
| }) | ||||||
| } | ||||||
|
|
||||||
| func matchArrayContains(s, value string) bool { | ||||||
| if s == "" { | ||||||
| return false | ||||||
| } | ||||||
| // Fast check: if the value is not present as a substring, it definitely won't be in the array. | ||||||
| if !strings.Contains(s, value) { | ||||||
| return false | ||||||
| } | ||||||
|
|
||||||
| // Fast check 2: must start with [ | ||||||
| if s[0] != '[' { | ||||||
| return false | ||||||
| } | ||||||
|
|
||||||
| // Use shared fastjson.ParserPool in order to avoid per-call parser allocations. | ||||||
| p := jspp.Get() | ||||||
| defer jspp.Put(p) | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function is a super hot path, so it may be worth duplicating |
||||||
| v, err := p.Parse(s) | ||||||
| if err != nil { | ||||||
| return false | ||||||
| } | ||||||
|
|
||||||
| // Check if it is an array | ||||||
| a, err := v.Array() | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| if err != nil { | ||||||
| return false | ||||||
| } | ||||||
|
|
||||||
| for _, elem := range a { | ||||||
| // We only support checking against string representation of values in the array. | ||||||
| var sElem string | ||||||
| switch elem.Type() { | ||||||
| case fastjson.TypeString: | ||||||
| sElem = string(elem.GetStringBytes()) | ||||||
| case fastjson.TypeNumber: | ||||||
| sElem = elem.String() | ||||||
| case fastjson.TypeTrue: | ||||||
| sElem = "true" | ||||||
| case fastjson.TypeFalse: | ||||||
| sElem = "false" | ||||||
| case fastjson.TypeNull: | ||||||
| sElem = "null" | ||||||
| default: | ||||||
| continue | ||||||
| } | ||||||
|
|
||||||
| if sElem == value { | ||||||
| return true | ||||||
| } | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This loop's body is unoptimized. If you want to maximize performance, pass a byte buffer into this function. +, try using Also to follow conventions, you can use |
||||||
| } | ||||||
|
|
||||||
| return false | ||||||
| } | ||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,162 @@ | ||
| package logstorage | ||
|
|
||
| import ( | ||
| "testing" | ||
|
|
||
| "github.com/VictoriaMetrics/VictoriaMetrics/lib/fs" | ||
| ) | ||
|
|
||
| func TestMatchArrayContains(t *testing.T) { | ||
| t.Parallel() | ||
|
|
||
| f := func(s, value string, resultExpected bool) { | ||
| t.Helper() | ||
| result := matchArrayContains(s, value) | ||
| if result != resultExpected { | ||
| t.Fatalf("unexpected result for s=%q, value=%q; got %v; want %v", s, value, result, resultExpected) | ||
| } | ||
| } | ||
|
|
||
| // Not an array | ||
| f("", "foo", false) | ||
| f("foo", "foo", false) | ||
| f("{}", "foo", false) | ||
|
|
||
| // Array doesn't contain value | ||
| f("[]", "foo", false) | ||
| f(`["bar"]`, "foo", false) | ||
| f(`["bar","baz"]`, "foo", false) | ||
| f(`[1,2]`, "3", false) | ||
|
|
||
| // Array contains value | ||
| f(`["foo"]`, "foo", true) | ||
| f(`["bar","foo"]`, "foo", true) | ||
| f(`["foo","bar"]`, "foo", true) | ||
| f(`["a","foo","b"]`, "foo", true) | ||
|
|
||
| // Mixed types | ||
| f(`[123]`, "123", true) | ||
| f(`[true]`, "true", true) | ||
| f(`["123"]`, "123", true) | ||
|
|
||
| // Tricky cases | ||
| f(`["foo bar"]`, "foo", false) // partial match | ||
| f(`["foobar"]`, "foo", false) // partial match | ||
| f(`["foo"]`, "fo", false) // partial match | ||
|
|
||
| // Nested structures (ignored by current implementation) | ||
| f(`[{"a":"b"}]`, `{"a":"b"}`, false) // nested object ignored | ||
| f(`[["a"]]`, `["a"]`, false) // nested array ignored | ||
| f(`[["a"], "b"]`, "b", true) // mixed with simple value | ||
| } | ||
|
|
||
| func TestFilterArrayContains(t *testing.T) { | ||
| t.Parallel() | ||
|
|
||
| t.Run("const-column", func(t *testing.T) { | ||
| columns := []column{ | ||
| { | ||
| name: "foo", | ||
| values: []string{ | ||
| `["a","b"]`, | ||
| `["a","b"]`, | ||
| `["a","b"]`, | ||
| }, | ||
| }, | ||
| } | ||
|
|
||
| // match | ||
| fa := &filterArrayContains{ | ||
| fieldName: "foo", | ||
| value: "a", | ||
| } | ||
| testFilterMatchForColumns(t, columns, fa, "foo", []int{0, 1, 2}) | ||
|
|
||
| fa = &filterArrayContains{ | ||
| fieldName: "foo", | ||
| value: "b", | ||
| } | ||
| testFilterMatchForColumns(t, columns, fa, "foo", []int{0, 1, 2}) | ||
|
|
||
| // mismatch | ||
| fa = &filterArrayContains{ | ||
| fieldName: "foo", | ||
| value: "c", | ||
| } | ||
| testFilterMatchForColumns(t, columns, fa, "foo", nil) | ||
|
|
||
| fa = &filterArrayContains{ | ||
| fieldName: "non-existing-column", | ||
| value: "a", | ||
| } | ||
| testFilterMatchForColumns(t, columns, fa, "foo", nil) | ||
| }) | ||
|
|
||
| t.Run("dict", func(t *testing.T) { | ||
| columns := []column{ | ||
| { | ||
| name: "foo", | ||
| values: []string{ | ||
| "", | ||
| `["a"]`, | ||
| `["b"]`, | ||
| `["a","b"]`, | ||
| `"a"`, // not an array | ||
| `[1,2]`, | ||
| }, | ||
| }, | ||
| } | ||
|
|
||
| // match | ||
| fa := &filterArrayContains{ | ||
| fieldName: "foo", | ||
| value: "a", | ||
| } | ||
| testFilterMatchForColumns(t, columns, fa, "foo", []int{1, 3}) | ||
|
|
||
| fa = &filterArrayContains{ | ||
| fieldName: "foo", | ||
| value: "b", | ||
| } | ||
| testFilterMatchForColumns(t, columns, fa, "foo", []int{2, 3}) | ||
|
|
||
| // mismatch | ||
| fa = &filterArrayContains{ | ||
| fieldName: "foo", | ||
| value: "c", | ||
| } | ||
| testFilterMatchForColumns(t, columns, fa, "foo", nil) | ||
| }) | ||
|
|
||
| t.Run("strings", func(t *testing.T) { | ||
| columns := []column{ | ||
| { | ||
| name: "foo", | ||
| values: []string{ | ||
| `["apple", "banana"]`, | ||
| `["orange"]`, | ||
| `not array`, | ||
| `["apple"]`, | ||
| `[]`, | ||
| }, | ||
| }, | ||
| } | ||
|
|
||
| // match | ||
| fa := &filterArrayContains{ | ||
| fieldName: "foo", | ||
| value: "apple", | ||
| } | ||
| testFilterMatchForColumns(t, columns, fa, "foo", []int{0, 3}) | ||
|
|
||
| // mismatch | ||
| fa = &filterArrayContains{ | ||
| fieldName: "foo", | ||
| value: "pear", | ||
| } | ||
| testFilterMatchForColumns(t, columns, fa, "foo", nil) | ||
| }) | ||
|
|
||
| // Remove the remaining data files for the test | ||
| fs.MustRemoveDir(t.Name()) | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2007,6 +2007,8 @@ func parseFilterGeneric(lex *lexer, fieldName string) (filter, error) { | |
| return parseFilterContainsAll(lex, fieldName) | ||
| case lex.isKeyword("contains_any"): | ||
| return parseFilterContainsAny(lex, fieldName) | ||
| case lex.isKeyword("array_contains"): | ||
| return parseFilterArrayContains(lex, fieldName) | ||
|
Comment on lines
+2010
to
+2011
|
||
| case lex.isKeyword("contains_common_case"): | ||
| return parseFilterContainsCommonCase(lex, fieldName) | ||
| case lex.isKeyword("eq_field"): | ||
|
|
@@ -2311,6 +2313,16 @@ func parseFilterContainsAny(lex *lexer, fieldName string) (filter, error) { | |
| return parseInValues(lex, fieldName, fi, &fi.values) | ||
| } | ||
|
|
||
| func parseFilterArrayContains(lex *lexer, fieldName string) (filter, error) { | ||
| return parseFuncArg(lex, fieldName, func(arg string) (filter, error) { | ||
| fa := &filterArrayContains{ | ||
| fieldName: getCanonicalColumnName(fieldName), | ||
| value: arg, | ||
| } | ||
| return fa, nil | ||
| }) | ||
| } | ||
|
|
||
| func parseFilterIn(lex *lexer, fieldName string) (filter, error) { | ||
| fi := &filterIn{ | ||
| fieldName: getCanonicalColumnName(fieldName), | ||
|
|
@@ -3803,6 +3815,7 @@ var reservedKeywords = func() map[string]struct{} { | |
| // functions | ||
| "contains_all", | ||
| "contains_any", | ||
| "array_contains", | ||
|
||
| "contains_common_case", | ||
| "eq_field", | ||
| "equals_common_case", | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This fast path is probably not correct:
e.g., the json array contains the string
a\\\"b(which represents the actual valuea"bafter json unescaping), but the searched value isa"b