From: Ralph Boehme Date: Mon, 5 Aug 2019 14:25:01 +0000 (+0200) Subject: s3:mdssvc: add Elasticsearch backend X-Git-Tag: talloc-2.3.1~443 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=f5510d7db3816c6a18dc333e1ae7f505fca69815;p=thirdparty%2Fsamba.git s3:mdssvc: add Elasticsearch backend Signed-off-by: Ralph Boehme Reviewed-by: Noel Power --- diff --git a/docs-xml/smbdotconf/misc/elasticsearchaddress.xml b/docs-xml/smbdotconf/misc/elasticsearchaddress.xml new file mode 100644 index 00000000000..61125461a91 --- /dev/null +++ b/docs-xml/smbdotconf/misc/elasticsearchaddress.xml @@ -0,0 +1,14 @@ + + + + Specifies the name of the Elasticsearch server to use for Spotlight + queries when using the Elasticsearch backend. + + + + localhost + needle.haystack.samba.org + diff --git a/docs-xml/smbdotconf/misc/elasticsearchindex.xml b/docs-xml/smbdotconf/misc/elasticsearchindex.xml new file mode 100644 index 00000000000..7f394b264f8 --- /dev/null +++ b/docs-xml/smbdotconf/misc/elasticsearchindex.xml @@ -0,0 +1,16 @@ + + + + Specifies the name of the Elasticsearch index to use for Spotlight queries + when using the Elasticsearch backend. The default value of "_all" is a + special Elasticsearch value that performs the search operation on all + indices. + + + + _all + spotlight + diff --git a/docs-xml/smbdotconf/misc/elasticsearchmappings.xml b/docs-xml/smbdotconf/misc/elasticsearchmappings.xml new file mode 100644 index 00000000000..d2502a6fc3b --- /dev/null +++ b/docs-xml/smbdotconf/misc/elasticsearchmappings.xml @@ -0,0 +1,14 @@ + + + + Path to a file specifying metadata attribute mappings in JSON format. Use + by the Elasticsearch backend of the Spotlight RPC service. + + + + &pathconfig.SAMBA_DATADIR;/elasticsearch_mappings.json + /usr/share/foo/mymappings.json + diff --git a/docs-xml/smbdotconf/misc/elasticsearchmaxresults.xml b/docs-xml/smbdotconf/misc/elasticsearchmaxresults.xml new file mode 100644 index 00000000000..1086b898ed8 --- /dev/null +++ b/docs-xml/smbdotconf/misc/elasticsearchmaxresults.xml @@ -0,0 +1,15 @@ + + + + Path to a file specifying metadata attribute mappings in JSON format. Used + by the Elasticsearch backend of the Spotlight RPC service. A value of 0 + means no limit. + + + + 100 + 10 + diff --git a/docs-xml/smbdotconf/misc/elasticsearchport.xml b/docs-xml/smbdotconf/misc/elasticsearchport.xml new file mode 100644 index 00000000000..ea87daabc28 --- /dev/null +++ b/docs-xml/smbdotconf/misc/elasticsearchport.xml @@ -0,0 +1,14 @@ + + + + Specifies the TCP port of the Elasticsearch server to use for Spotlight + queries when using the Elasticsearch backend. + + + + 9200 + 9201 + diff --git a/docs-xml/smbdotconf/misc/elasticsearchusetls.xml b/docs-xml/smbdotconf/misc/elasticsearchusetls.xml new file mode 100644 index 00000000000..e1aa8a31495 --- /dev/null +++ b/docs-xml/smbdotconf/misc/elasticsearchusetls.xml @@ -0,0 +1,14 @@ + + + + Specifies whether to use HTTPS when talking to the Elasticsearch server + used for Spotlight queries when using the Elasticsearch backend. + + + + no + yes + diff --git a/docs-xml/smbdotconf/misc/spotlight_backend.xml b/docs-xml/smbdotconf/misc/spotlight_backend.xml index 6d224b81e5f..0643fc16cbd 100644 --- a/docs-xml/smbdotconf/misc/spotlight_backend.xml +++ b/docs-xml/smbdotconf/misc/spotlight_backend.xml @@ -19,6 +19,10 @@ Gnome Tracker. + elasticsearch - + a backend that uses JSON and REST over HTTP(s) to query an + Elasticsearch server. + diff --git a/lib/param/loadparm.h b/lib/param/loadparm.h index 5c5b1cd7cd4..d0ce3d312e1 100644 --- a/lib/param/loadparm.h +++ b/lib/param/loadparm.h @@ -252,6 +252,7 @@ enum mangled_names_options {MANGLED_NAMES_NO, MANGLED_NAMES_YES, MANGLED_NAMES_I enum spotlight_backend_options { SPOTLIGHT_BACKEND_NOINDEX, SPOTLIGHT_BACKEND_TRACKER, + SPOTLIGHT_BACKEND_ES, }; /* diff --git a/lib/param/param_table.c b/lib/param/param_table.c index 82dc5cd6cda..2fd3361f996 100644 --- a/lib/param/param_table.c +++ b/lib/param/param_table.c @@ -354,6 +354,7 @@ static const struct enum_list enum_ntlm_auth[] = { static const struct enum_list enum_spotlight_backend[] = { {SPOTLIGHT_BACKEND_NOINDEX, "noindex"}, {SPOTLIGHT_BACKEND_TRACKER, "tracker"}, + {SPOTLIGHT_BACKEND_ES, "elasticsearch"}, {-1, NULL} }; diff --git a/source3/rpc_server/mdssvc/elasticsearch_mappings.json b/source3/rpc_server/mdssvc/elasticsearch_mappings.json new file mode 100644 index 00000000000..9f68a64925b --- /dev/null +++ b/source3/rpc_server/mdssvc/elasticsearch_mappings.json @@ -0,0 +1,142 @@ +{ + "attribute_mappings": { + "*": { + "type": "fts", + "attribute": "" + }, + "kMDItemTextContent": { + "type": "str", + "attribute": "content" + }, + "_kMDItemGroupId": { + "type": "type", + "attribute": "file.content_type" + }, + "kMDItemContentType": { + "type": "type", + "attribute": "file.content_type" + }, + "kMDItemContentTypeTree": { + "type": "type", + "attribute": "file.content_type" + }, + "kMDItemFSContentChangeDate": { + "type": "date", + "attribute": "file.last_modified" + }, + "kMDItemFSCreationDate": { + "type": "date", + "attribute": "file.created" + }, + "kMDItemFSName": { + "type": "str", + "attribute": "file.filename" + }, + "kMDItemFSOwnerGroupID": { + "type": "str", + "attribute": "attributes.owner" + }, + "kMDItemFSOwnerUserID": { + "type": "str", + "attribute": "attributes.group" + }, + "kMDItemFSSize": { + "type": "num", + "attribute": "file.filesize" + }, + "kMDItemPath": { + "type": "str", + "attribute": "path.real" + }, + "kMDItemAttributeChangeDate": { + "type": "date", + "attribute": "file.last_modified" + }, + "kMDItemAuthors": { + "type": "str", + "attribute": "meta.author" + }, + "kMDItemContentCreationDate": { + "type": "date", + "attribute": "file.created" + }, + "kMDItemContentModificationDate": { + "type": "date", + "attribute": "file.last_modified" + }, + "kMDItemCreator": { + "type": "str", + "attribute": "meta.raw.creator" + }, + "kMDItemDescription": { + "type": "str", + "attribute": "meta.raw.description" + }, + "kMDItemDisplayName": { + "type": "str", + "attribute": "file.filename" + }, + "kMDItemDurationSeconds": { + "type": "num", + "attribute": "meta.raw.xmpDM:duration" + }, + "kMDItemNumberOfPages": { + "type": "num", + "attribute": "meta.raw.xmpTPg:NPages" + }, + "kMDItemTitle": { + "type": "str", + "attribute": "meta.title" + }, + "kMDItemAlbum": { + "type": "str", + "attribute": "meta.raw.xmpDM:album" + }, + "kMDItemBitsPerSample": { + "type": "num", + "attribute": "meta.raw.tiff:BitsPerSample" + }, + "kMDItemPixelHeight": { + "type": "num", + "attribute": "meta.raw.Image Height" + }, + "kMDItemPixelWidth": { + "type": "num", + "attribute": "meta.raw.Image Width" + }, + "kMDItemResolutionHeightDPI": { + "type": "num", + "attribute": "meta.raw.Y Resolution" + }, + "kMDItemResolutionWidthDPI": { + "type": "num", + "attribute": "meta.raw.X Resolution" + } + }, + "mime_mappings": { + "1": "message/rfc822", + "2": "text/x-vcard", + "6": "text/x-vcard", + "7": "video/*", + "8": "application/octet-stream", + "9": "text/directory", + "10": "audio/*", + "11": "application/pdf", + "12": "application/vnd.oasis.opendocument.presentation", + "13": "image/*", + "public.content": "message/rfc822 application/pdf application/vnd.oasis.opendocument.presentation image/* text/*", + "public.jpeg": "image/jpeg", + "public.tiff": "image/tiff", + "com.compuserve.gif": "image/gif", + "public.png": "image/png", + "com.microsoft.bmp": "image/bmp", + "public.mp3": "audio/mpeg", + "public.mpeg-4-audio": "audio/x-aac", + "public.text": "text/*", + "public.plain-text": "text/plain", + "public.rtf": "text/rtf", + "public.html": "text/html", + "public.xml": "text/xml", + "public.archive": "application/zip application/x-bzip application/x-bzip2 application/x-tar application/x-7z-compressed" + } +} diff --git a/source3/rpc_server/mdssvc/es_lexer.l b/source3/rpc_server/mdssvc/es_lexer.l new file mode 100644 index 00000000000..4be42259f9c --- /dev/null +++ b/source3/rpc_server/mdssvc/es_lexer.l @@ -0,0 +1,92 @@ +/* + Unix SMB/CIFS implementation. + Main metadata server / Spotlight routines / Elasticsearch backend + + Copyright (C) Ralph Boehme 2019 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +%{ +#include "includes.h" +#include "rpc_server/mdssvc/es_parser.tab.h" + +#define YY_NO_INPUT +#define mdsyylalloc SMB_MALLOC +#define mdsyylrealloc SMB_REALLOC + +static char *strip_quote(const char *phrase); +%} + +%option nounput noyyalloc noyyrealloc prefix="mdsyyl" + +ASC [a-zA-Z0-9_\*\:\-\.] +U [\x80-\xbf] +U2 [\xc2-\xdf] +U3 [\xe0-\xef] +U4 [\xf0-\xf4] +SPECIAL [\!\#\$\%\&\'\(\)\+\,\.\/\;\<\=\>\?\@\[\]\^\`\{\}\|\~\\] +ESCHAR [\"\*] +BLANK [ \t\n] + +UANY {ASC}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} +UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} +UPHRASE {UANY}|{SPECIAL}|{BLANK}|\\{ESCHAR} + +%% +InRange return FUNC_INRANGE; +\$time\.iso return DATE_ISO; +false {mdsyyllval.bval = false; return BOOLEAN;} +true {mdsyyllval.bval = true; return BOOLEAN;} +\" return QUOTE; +\( return OBRACE; +\) return CBRACE; +\&\& return AND; +\|\| return OR; +\=\= return EQUAL; +\!\= return UNEQUAL; +\= return EQUAL; +\< return LT; +\> return GT; +\, return COMMA; +{UANY}+ {mdsyyllval.sval = talloc_strdup(talloc_tos(), yytext); return WORD;} +\"{UPHRASE}+\" {mdsyyllval.sval = strip_quote(yytext); return PHRASE;} +{BLANK} /* ignore */ +%% + +static char *strip_quote(const char *phrase) +{ + size_t phrase_len = 0; + char *stripped_phrase = NULL; + + if (phrase == NULL) { + return NULL; + } + + phrase_len = strlen(phrase); + if (phrase_len < 2 || + phrase[0] != '\"' || + phrase[phrase_len - 1] != '\"') + { + return talloc_strdup(talloc_tos(), phrase); + } + + phrase++; + + stripped_phrase = talloc_strndup(talloc_tos(), phrase, phrase_len - 2); + if (stripped_phrase == NULL) { + return NULL; + } + return stripped_phrase; +} diff --git a/source3/rpc_server/mdssvc/es_mapping.c b/source3/rpc_server/mdssvc/es_mapping.c new file mode 100644 index 00000000000..5c71e503bf5 --- /dev/null +++ b/source3/rpc_server/mdssvc/es_mapping.c @@ -0,0 +1,241 @@ +/* + Unix SMB/CIFS implementation. + Main metadata server / Spotlight routines / Elasticsearch backend + + Copyright (C) Ralph Boehme 2019 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "includes.h" +#include "es_mapping.h" + +/* + * Escaping of special characters in Lucene query syntax across HTTP and JSON + * ========================================================================== + * + * These characters in Lucene queries need escaping [1]: + * + * + - & | ! ( ) { } [ ] ^ " ~ * ? : \ / + * + * Additionally JSON requires escaping of: + * + * " \ + * + * Characters already escaped by the mdssvc client: + * + * * " \ + * + * The following table contains the resulting escaped strings, beginning with the + * search term, the corresponding Spotlight query and the final string that gets + * sent to the target Elasticsearch server. + * + * string | mdfind | http + * -------+--------+------ + * x!x x!x x\\!x + * x&x x&x x\\&x + * x+x x+x x\\+x + * x-x x-x x\\-x + * x.x x.x x\\.x + * xx x>x x\\>x + * x=x x=x x\\=x + * x?x x?x x\\?x + * x[x x[x x\\[x + * x]x x]x x\\]x + * x^x x^x x\\^x + * x{x x{x x\\{x + * x}x x}x x\\}x + * x|x x|x x\\|x + * x x x x x\\ x + * x*x x\*x x\\*x + * x\x x\\x x\\\\x + * x"x x\"x x\\\"x + * + * Special cases: + * x y It's not possible to search for terms including spaces, Spotlight + * will search for x OR y. + * x(x Search for terms including ( and ) doesn not work with Spotlight. + * + * [1] + */ + +static char *escape_str(TALLOC_CTX *mem_ctx, + const char *in, + const char *escape_list, + const char *escape_exceptions) +{ + char *out = NULL; + size_t in_len; + size_t new_len; + size_t in_pos; + size_t out_pos = 0; + + if (in == NULL) { + return NULL; + } + in_len = strlen(in); + + if (escape_list == NULL) { + escape_list = ""; + } + if (escape_exceptions == NULL) { + escape_exceptions = ""; + } + + /* + * Allocate enough space for the worst case: every char needs to be + * escaped and requires an additional char. + */ + new_len = (in_len * 2) + 1; + if (new_len <= in_len) { + return NULL; + } + + out = talloc_zero_array(mem_ctx, char, new_len); + if (out == NULL) { + return NULL; + } + + for (in_pos = 0, out_pos = 0; in_pos < in_len; in_pos++, out_pos++) { + if (strchr(escape_list, in[in_pos]) != NULL && + strchr(escape_exceptions, in[in_pos]) == NULL) + { + out[out_pos++] = '\\'; + } + out[out_pos] = in[in_pos]; + } + + return out; +} + +char *es_escape_str(TALLOC_CTX *mem_ctx, + const char *in, + const char *exceptions) +{ + const char *lucene_escape_list = "+-&|!(){}[]^\"~*?:\\/ "; + const char *json_escape_list = "\\\""; + char *lucene_escaped = NULL; + char *full_escaped = NULL; + + lucene_escaped = escape_str(mem_ctx, + in, + lucene_escape_list, + exceptions); + if (lucene_escaped == NULL) { + return NULL; + } + + full_escaped = escape_str(mem_ctx, + lucene_escaped, + json_escape_list, + NULL); + TALLOC_FREE(lucene_escaped); + return full_escaped; +} + +struct es_attr_map *es_map_sl_attr(TALLOC_CTX *mem_ctx, + json_t *kmd_map, + const char *sl_attr) +{ + struct es_attr_map *es_map = NULL; + const char *typestr = NULL; + enum ssm_type type; + char *es_attr = NULL; + size_t i; + int cmp; + int ret; + + static struct { + const char *typestr; + enum ssm_type typeval; + } ssmt_type_map[] = { + {"bool", ssmt_bool}, + {"num", ssmt_num}, + {"str", ssmt_str}, + {"fts", ssmt_fts}, + {"date", ssmt_date}, + {"type", ssmt_type}, + }; + + if (sl_attr == NULL) { + return NULL; + } + + ret = json_unpack(kmd_map, + "{s: {s: s}}", + sl_attr, + "type", + &typestr); + if (ret != 0) { + DBG_ERR("No JSON type mapping for [%s]\n", sl_attr); + return NULL; + } + + ret = json_unpack(kmd_map, + "{s: {s: s}}", + sl_attr, + "attribute", + &es_attr); + if (ret != 0) { + DBG_ERR("No JSON attribute mapping for [%s]\n", sl_attr); + return NULL; + } + + for (i = 0; i < ARRAY_SIZE(ssmt_type_map); i++) { + cmp = strcmp(typestr, ssmt_type_map[i].typestr); + if (cmp == 0) { + type = ssmt_type_map[i].typeval; + break; + } + } + if (i == ARRAY_SIZE(ssmt_type_map)) { + return NULL; + } + + es_map = talloc_zero(mem_ctx, struct es_attr_map); + if (es_map == NULL) { + return NULL; + } + es_map->type = type; + + es_map->name = es_escape_str(es_map, es_attr, NULL); + if (es_map->name == NULL) { + TALLOC_FREE(es_map); + return false; + } + + return es_map; +} + +const char *es_map_sl_type(json_t *mime_map, + const char *sl_type) +{ + const char *mime_type = NULL; + int ret; + + if (sl_type == NULL) { + return NULL; + } + + ret = json_unpack(mime_map, + "{s: s}", + sl_type, + &mime_type); + if (ret != 0) { + return NULL; + } + + return mime_type; +} diff --git a/source3/rpc_server/mdssvc/es_mapping.h b/source3/rpc_server/mdssvc/es_mapping.h new file mode 100644 index 00000000000..29511b52630 --- /dev/null +++ b/source3/rpc_server/mdssvc/es_mapping.h @@ -0,0 +1,49 @@ +/* + Unix SMB/CIFS implementation. + Main metadata server / Spotlight routines / Elasticsearch backend + + Copyright (c) Ralph Boehme 2019 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef _ES_MAPPING_H_ +#define _ES_MAPPING_H_ + +#include + +enum ssm_type { + ssmt_bool, /* a boolean value */ + ssmt_num, /* a numeric value */ + ssmt_str, /* a string value */ + ssmt_fts, /* a string value */ + ssmt_date, /* date values */ + ssmt_type /* kMDItemContentType, requires special mapping */ +}; + +struct es_attr_map { + enum ssm_type type; + const char *name; +}; + +char *es_escape_str(TALLOC_CTX *mem_ctx, + const char *in, + const char *exceptions); +struct es_attr_map *es_map_sl_attr(TALLOC_CTX *mem_ctx, + json_t *kmd_map, + const char *sl_attr); +const char *es_map_sl_type(json_t *mime_map, + const char *sl_type); + +#endif diff --git a/source3/rpc_server/mdssvc/es_parser.y b/source3/rpc_server/mdssvc/es_parser.y new file mode 100644 index 00000000000..0514183b35d --- /dev/null +++ b/source3/rpc_server/mdssvc/es_parser.y @@ -0,0 +1,625 @@ +/* + Unix SMB/CIFS implementation. + Main metadata server / Spotlight routines / Elasticsearch backend + + Copyright (C) Ralph Boehme 2019 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +%{ + #include "includes.h" + #include "rpc_server/mdssvc/mdssvc.h" + #include "rpc_server/mdssvc/mdssvc_es.h" + #include "rpc_server/mdssvc/es_parser.tab.h" + #include "rpc_server/mdssvc/es_mapping.h" + #include + + /* + * allow building with -O3 -Wp,-D_FORTIFY_SOURCE=2 + * + * /tmp/samba-testbase/.../mdssvc/es_parser.y: In function + * ‘mdsyylparse’: + * es_parser.tab.c:1124:6: error: assuming pointer wraparound + * does not occur when comparing P +- C1 with P +- C2 + * [-Werror=strict-overflow] + * + * The generated code in es_parser.tab.c looks like this: + * + * if (yyss + yystacksize - 1 <= yyssp) + */ + #pragma GCC diagnostic ignored "-Wstrict-overflow" + + #define YYMALLOC SMB_MALLOC + #define YYREALLOC SMB_REALLOC + + struct yy_buffer_state; + typedef struct yy_buffer_state *YY_BUFFER_STATE; + int mdsyyllex(void); + void mdsyylerror(char const *); + void *mdsyylterminate(void); + YY_BUFFER_STATE mdsyyl_scan_string(const char *str); + void mdsyyl_delete_buffer(YY_BUFFER_STATE buffer); + + /* forward declarations */ + static char *isodate_to_sldate(const char *s); + static char *map_expr(const struct es_attr_map *attr, + char op, + const char *val1, + const char *val2); + + /* global vars, eg needed by the lexer */ + struct es_parser_state { + TALLOC_CTX *frame; + json_t *kmd_map; + json_t *mime_map; + YY_BUFFER_STATE s; + const char *result; + } *global_es_parser_state; +%} + +%code provides { + #include + #include + #include "rpc_server/mdssvc/mdssvc.h" + + /* 2001-01-01T00:00:00Z - Unix Epoch = SP_RAW_TIME_OFFSET */ + #define SP_RAW_TIME_OFFSET 978307200 + + int mdsyylwrap(void); + bool map_spotlight_to_es_query(TALLOC_CTX *mem_ctx, + json_t *mappings, + const char *path_scope, + const char *query_string, + char **_es_query); +} + +%union { + bool bval; + const char *sval; + struct es_attr_map *attr_map; +} + +%name-prefix "mdsyyl" +%expect 1 +%error-verbose + +%type match expr line function value isodate +%type attribute + +%token WORD PHRASE +%token BOOLEAN +%token FUNC_INRANGE +%token DATE_ISO +%token OBRACE CBRACE EQUAL UNEQUAL GT LT COMMA QUOTE +%left OR +%left AND +%% + +input: +/* empty */ +| input line +; + +line: +expr { + global_es_parser_state->result = $1; +} +; + +expr: +OBRACE expr CBRACE { + if ($2 == NULL) YYABORT; + $$ = talloc_asprintf(talloc_tos(), "(%s)", $2); + if ($$ == NULL) YYABORT; +} +| expr AND expr { + $$ = talloc_asprintf(talloc_tos(), "(%s) AND (%s)", $1, $3); + if ($$ == NULL) YYABORT; +} +| expr OR expr { + $$ = talloc_asprintf(talloc_tos(), "%s OR %s", $1, $3); + if ($$ == NULL) YYABORT; +} +| match { + $$ = $1; +} +| BOOLEAN { + /* + * We can't properly handle these in expressions, fortunately this + * is probably only ever used by OS X as sole element in an + * expression ie "False" (when Finder window selected our share + * but no search string entered yet). Packet traces showed that OS + * X Spotlight server then returns a failure (ie -1) which is what + * we do here too by calling YYABORT. + */ + YYABORT; +}; + +match: +attribute EQUAL value { + $$ = map_expr($1, '=', $3, NULL); + if ($$ == NULL) YYABORT; +} +| attribute UNEQUAL value { + $$ = map_expr($1, '!', $3, NULL); + if ($$ == NULL) YYABORT; +} +| attribute LT value { + $$ = map_expr($1, '<', $3, NULL); + if ($$ == NULL) YYABORT; +} +| attribute GT value { + $$ = map_expr($1, '>', $3, NULL); + if ($$ == NULL) YYABORT; +} +| function { + $$ = $1; +} +| match WORD { + $$ = $1; +}; + +function: +FUNC_INRANGE OBRACE attribute COMMA WORD COMMA WORD CBRACE { + $$ = map_expr($3, '~', $5, $7); + if ($$ == NULL) YYABORT; +}; + +attribute: +WORD { + $$ = es_map_sl_attr(global_es_parser_state->frame, + global_es_parser_state->kmd_map, + $1); + if ($$ == NULL) YYABORT; +}; + +value: +PHRASE { + $$ = $1; +} +| isodate { + $$ = $1; +}; + +isodate: +DATE_ISO OBRACE WORD CBRACE { + $$ = isodate_to_sldate($3); + if ($$ == NULL) YYABORT; +}; + +%% + +/* + * Spotlight has two date formats: + * - seconds since 2001-01-01 00:00:00Z + * - as string "$time.iso(%Y-%m-%dT%H:%M:%SZ)" + * This function converts the latter to the former as string, so the parser + * can work on a uniform format. + */ +static char *isodate_to_sldate(const char *isodate) +{ + struct es_parser_state *s = global_es_parser_state; + struct tm tm; + const char *p = NULL; + char *tstr = NULL; + time_t t; + + p = strptime(isodate, "%Y-%m-%dT%H:%M:%SZ", &tm); + if (p == NULL) { + DBG_ERR("strptime [%s] failed\n", isodate); + return NULL; + } + + t = timegm(&tm); + t -= SP_RAW_TIME_OFFSET; + + tstr = talloc_asprintf(s->frame, "%jd", (intmax_t)t); + if (tstr == NULL) { + return NULL; + } + + return tstr; +} + +static char *map_type(const struct es_attr_map *attr, + char op, + const char *val) +{ + struct es_parser_state *s = global_es_parser_state; + const char *mime_type_list = NULL; + char *esc_mime_type_list = NULL; + const char *not = NULL; + const char *end = NULL; + char *es = NULL; + + mime_type_list = es_map_sl_type(s->mime_map, val); + if (mime_type_list == NULL) { + DBG_ERR("Mapping type [%s] failed\n", val); + return NULL; + } + + esc_mime_type_list = es_escape_str(s->frame, + mime_type_list, + "* "); + if (esc_mime_type_list == NULL) { + return NULL; + } + + switch (op) { + case '=': + not = ""; + end = ""; + break; + case '!': + not = "(NOT "; + end = ")"; + break; + default: + DBG_ERR("Mapping type [%s] unexpected op [%c]\n", val, op); + return NULL; + } + es = talloc_asprintf(s->frame, + "%s%s:(%s)%s", + not, + attr->name, + esc_mime_type_list, + end); + if (es == NULL) { + return NULL; + } + + return es; +} + +static char *map_num(const struct es_attr_map *attr, + char op, + const char *val1, + const char *val2) +{ + struct es_parser_state *s = global_es_parser_state; + char *es = NULL; + + switch (op) { + case '>': + es = talloc_asprintf(s->frame, + "%s:{%s TO *}", + attr->name, + val1); + break; + case '<': + es = talloc_asprintf(s->frame, + "%s:{* TO %s}", + attr->name, + val1); + break; + case '~': + es = talloc_asprintf(s->frame, + "%s:[%s TO %s]", + attr->name, + val1, + val2); + break; + case '=': + es = talloc_asprintf(s->frame, + "%s:%s", + attr->name, + val1); + break; + case '!': + es = talloc_asprintf(s->frame, + "(NOT %s:%s)", + attr->name, + val1); + break; + default: + DBG_ERR("Mapping num unexpected op [%c]\n", op); + return NULL; + } + if (es == NULL) { + return NULL; + } + + return es; +} + +static char *map_fts(const struct es_attr_map *attr, + char op, + const char *val) +{ + struct es_parser_state *s = global_es_parser_state; + const char *not = NULL; + const char *end = NULL; + char *esval = NULL; + char *es = NULL; + + esval = es_escape_str(s->frame, val, "*\\\""); + if (esval == NULL) { + yyerror("es_escape_str failed"); + return NULL; + } + + switch (op) { + case '=': + not = ""; + end = ""; + break; + case '!': + not = "(NOT "; + end = ")"; + break; + default: + DBG_ERR("Mapping fts [%s] unexpected op [%c]\n", val, op); + return NULL; + } + es = talloc_asprintf(s->frame, + "%s%s%s", + not, + esval, + end); + if (es == NULL) { + return NULL; + } + return es; +} + +static char *map_str(const struct es_attr_map *attr, + char op, + const char *val) +{ + struct es_parser_state *s = global_es_parser_state; + char *esval = NULL; + char *es = NULL; + const char *not = NULL; + const char *end = NULL; + + esval = es_escape_str(s->frame, val, "*\\\""); + if (esval == NULL) { + yyerror("es_escape_str failed"); + return NULL; + } + + switch (op) { + case '=': + not = ""; + end = ""; + break; + case '!': + not = "(NOT "; + end = ")"; + break; + default: + DBG_ERR("Mapping string [%s] unexpected op [%c]\n", val, op); + return NULL; + } + + es = talloc_asprintf(s->frame, + "%s%s:%s%s", + not, + attr->name, + esval, + end); + if (es == NULL) { + return NULL; + } + return es; +} + +/* + * Convert Spotlight date seconds since 2001-01-01 00:00:00Z + * to a date string in the format %Y-%m-%dT%H:%M:%SZ. + */ +static char *map_sldate_to_esdate(TALLOC_CTX *mem_ctx, + const char *sldate) +{ + struct tm *tm = NULL; + char *esdate = NULL; + char buf[21]; + size_t len; + time_t t; + int error; + + t = (time_t)smb_strtoull(sldate, NULL, 10, &error, SMB_STR_STANDARD); + if (error != 0) { + DBG_ERR("smb_strtoull [%s] failed\n", sldate); + return NULL; + } + t += SP_RAW_TIME_OFFSET; + + tm = gmtime(&t); + if (tm == NULL) { + DBG_ERR("localtime [%s] failed\n", sldate); + return NULL; + } + + len = strftime(buf, sizeof(buf), + "%Y-%m-%dT%H:%M:%SZ", tm); + if (len != 20) { + DBG_ERR("strftime [%s] failed\n", sldate); + return NULL; + } + + esdate = es_escape_str(mem_ctx, buf, NULL); + if (esdate == NULL) { + yyerror("es_escape_str failed"); + return NULL; + } + return esdate; +} + +static char *map_date(const struct es_attr_map *attr, + char op, + const char *sldate1, + const char *sldate2) +{ + struct es_parser_state *s = global_es_parser_state; + char *esdate1 = NULL; + char *esdate2 = NULL; + char *es = NULL; + + if (op == '~' && sldate2 == NULL) { + DBG_ERR("Date range query, but second date is NULL\n"); + return NULL; + } + + esdate1 = map_sldate_to_esdate(s->frame, sldate1); + if (esdate1 == NULL) { + DBG_ERR("map_sldate_to_esdate [%s] failed\n", sldate1); + return NULL; + } + if (sldate2 != NULL) { + esdate2 = map_sldate_to_esdate(s->frame, sldate2); + if (esdate2 == NULL) { + DBG_ERR("map_sldate_to_esdate [%s] failed\n", sldate2); + return NULL; + } + } + + switch (op) { + case '>': + es = talloc_asprintf(s->frame, + "%s:{%s TO *}", + attr->name, + esdate1); + break; + case '<': + es = talloc_asprintf(s->frame, + "%s:{* TO %s}", + attr->name, + esdate1); + break; + case '~': + es = talloc_asprintf(s->frame, + "%s:[%s TO %s]", + attr->name, + esdate1, + esdate2); + break; + case '=': + es = talloc_asprintf(s->frame, + "%s:%s", + attr->name, + esdate1); + break; + case '!': + es = talloc_asprintf(s->frame, + "(NOT %s:%s)", + attr->name, + esdate1); + break; + } + if (es == NULL) { + return NULL; + } + return es; +} + +static char *map_expr(const struct es_attr_map *attr, + char op, + const char *val1, + const char *val2) +{ + char *es = NULL; + + switch (attr->type) { + case ssmt_type: + es = map_type(attr, op, val1); + break; + case ssmt_num: + es = map_num(attr, op, val1, val2); + break; + case ssmt_fts: + es = map_fts(attr, op, val1); + break; + case ssmt_str: + es = map_str(attr, op, val1); + break; + case ssmt_date: + es = map_date(attr, op, val1, val2); + break; + default: + break; + } + if (es == NULL) { + DBG_ERR("Mapping [%s %c %s (%s)] failed\n", + attr->name, op, val1, val2 ? val2 : ""); + return NULL; + } + + return es; +} + +void mdsyylerror(const char *str) +{ + DBG_ERR("Parser failed: %s\n", str); +} + +int mdsyylwrap(void) +{ + return 1; +} + +/** + * Map a Spotlight RAW query string to a ES query string + **/ +bool map_spotlight_to_es_query(TALLOC_CTX *mem_ctx, + json_t *mappings, + const char *path_scope, + const char *query_string, + char **_es_query) +{ + struct es_parser_state s = { + .frame = talloc_stackframe(), + }; + int result; + char *es_query = NULL; + + s.kmd_map = json_object_get(mappings, "attribute_mappings"); + if (s.kmd_map == NULL) { + DBG_ERR("Failed to load attribute_mappings from JSON\n"); + return false; + } + s.mime_map = json_object_get(mappings, "mime_mappings"); + if (s.mime_map == NULL) { + DBG_ERR("Failed to load mime_mappings from JSON\n"); + return false; + } + + s.s = mdsyyl_scan_string(query_string); + if (s.s == NULL) { + DBG_WARNING("Failed to parse [%s]\n", query_string); + TALLOC_FREE(s.frame); + return false; + } + global_es_parser_state = &s; + result = mdsyylparse(); + global_es_parser_state = NULL; + mdsyyl_delete_buffer(s.s); + + if (result != 0) { + TALLOC_FREE(s.frame); + return false; + } + + es_query = talloc_asprintf(mem_ctx, + "(%s) AND path.real.fulltext:\\\"%s\\\"", + s.result, path_scope); + TALLOC_FREE(s.frame); + if (es_query == NULL) { + return false; + } + + *_es_query = es_query; + return true; +} diff --git a/source3/rpc_server/mdssvc/es_parser_test.c b/source3/rpc_server/mdssvc/es_parser_test.c new file mode 100644 index 00000000000..5751606fa1e --- /dev/null +++ b/source3/rpc_server/mdssvc/es_parser_test.c @@ -0,0 +1,99 @@ +/* + Unix SMB/CIFS implementation. + Main metadata server / Spotlight routines / ES backend + + Copyright (C) Ralph Boehme 2019 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "includes.h" +#include "rpc_server/mdssvc/mdssvc.h" +#include "rpc_server/mdssvc/mdssvc_es.h" +#include "rpc_server/mdssvc/es_parser.tab.h" +#include "rpc_server/mdssvc/es_mapping.h" + +/* + * Examples: + * + * $ ./spotlight2es '_kMDItemGroupId=="11"' + * ... + * $ ./spotlight2es '*=="test*"||kMDItemTextContent=="test*"' + * ... + */ + +int main(int argc, char **argv) +{ + TALLOC_CTX *mem_ctx = NULL; + json_t *mappings = NULL; + json_error_t json_error; + char *default_path = NULL; + char *path = NULL; + const char *query_string = NULL; + const char *path_scope = NULL; + char *es_query = NULL; + bool ok; + + if (argc != 2) { + printf("usage: %s QUERY\n", argv[0]); + return 1; + } + query_string = argv[1]; + path_scope = "/foo/bar"; + + lp_load_global(get_dyn_CONFIGFILE()); + + mem_ctx = talloc_init("es_parser_test"); + if (mem_ctx == NULL) { + return 1; + } + + default_path = talloc_asprintf(mem_ctx, + "%s/mdssvc/elasticsearch_mappings.json", + get_dyn_SAMBA_DATADIR()); + if (default_path == NULL) { + TALLOC_FREE(mem_ctx); + return 1; + } + + path = lp_parm_talloc_string(mem_ctx, + GLOBAL_SECTION_SNUM, + "elasticsearch", + "mappings", + default_path); + TALLOC_FREE(default_path); + if (path == NULL) { + TALLOC_FREE(mem_ctx); + return 1; + } + + mappings = json_load_file(path, 0, &json_error); + if (mappings == NULL) { + DBG_ERR("Opening mapping file [%s] failed: %s\n", + path, strerror(errno)); + TALLOC_FREE(mem_ctx); + return 1; + } + + ok = map_spotlight_to_es_query(mem_ctx, + mappings, + path_scope, + query_string, + &es_query); + printf("%s\n", ok ? es_query : "*mapping failed*"); + + json_decref(mappings); + talloc_free(mem_ctx); + return ok ? 0 : 1; +} diff --git a/source3/rpc_server/mdssvc/mdssvc.c b/source3/rpc_server/mdssvc/mdssvc.c index 24183329301..fce3335d602 100644 --- a/source3/rpc_server/mdssvc/mdssvc.c +++ b/source3/rpc_server/mdssvc/mdssvc.c @@ -31,6 +31,9 @@ #ifdef HAVE_SPOTLIGHT_BACKEND_TRACKER #include "mdssvc_tracker.h" #endif +#ifdef HAVE_SPOTLIGHT_BACKEND_ES +#include "mdssvc_es.h" +#endif #undef DBGC_CLASS #define DBGC_CLASS DBGC_RPC_SRV @@ -1422,6 +1425,15 @@ static struct mdssvc_ctx *mdssvc_init(struct tevent_context *ev) return NULL; } +#ifdef HAVE_SPOTLIGHT_BACKEND_ES + ok = mdsscv_backend_es.init(mdssvc_ctx); + if (!ok) { + DBG_ERR("backend init failed\n"); + TALLOC_FREE(mdssvc_ctx); + return NULL; + } +#endif + #ifdef HAVE_SPOTLIGHT_BACKEND_TRACKER ok = mdsscv_backend_tracker.init(mdssvc_ctx); if (!ok) { @@ -1457,6 +1469,14 @@ bool mds_shutdown(void) if (!ok) { goto fail; } + +#ifdef HAVE_SPOTLIGHT_BACKEND_ES + ok = mdsscv_backend_es.shutdown(mdssvc_ctx); + if (!ok) { + goto fail; + } +#endif + #ifdef HAVE_SPOTLIGHT_BACKEND_TRACKER ok = mdsscv_backend_tracker.shutdown(mdssvc_ctx); if (!ok) { @@ -1528,6 +1548,13 @@ struct mds_ctx *mds_init_ctx(TALLOC_CTX *mem_ctx, case SPOTLIGHT_BACKEND_NOINDEX: mds_ctx->backend = &mdsscv_backend_noindex; break; + +#ifdef HAVE_SPOTLIGHT_BACKEND_ES + case SPOTLIGHT_BACKEND_ES: + mds_ctx->backend = &mdsscv_backend_es; + break; +#endif + #ifdef HAVE_SPOTLIGHT_BACKEND_TRACKER case SPOTLIGHT_BACKEND_TRACKER: mds_ctx->backend = &mdsscv_backend_tracker; diff --git a/source3/rpc_server/mdssvc/mdssvc_es.c b/source3/rpc_server/mdssvc/mdssvc_es.c new file mode 100644 index 00000000000..3c54abf01fd --- /dev/null +++ b/source3/rpc_server/mdssvc/mdssvc_es.c @@ -0,0 +1,835 @@ +/* + Unix SMB/CIFS implementation. + Main metadata server / Spotlight routines / ES backend + + Copyright (C) Ralph Boehme 2019 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "includes.h" +#include "system/filesys.h" +#include "lib/util/time_basic.h" +#include "lib/tls/tls.h" +#include "lib/util/tevent_ntstatus.h" +#include "libcli/http/http.h" +#include "lib/util/tevent_unix.h" +#include "credentials.h" +#include "mdssvc.h" +#include "mdssvc_es.h" +#include "rpc_server/mdssvc/es_parser.tab.h" + +#include + +#undef DBGC_CLASS +#define DBGC_CLASS DBGC_RPC_SRV + +#define MDSSVC_ELASTIC_QUERY_TEMPLATE \ + "{" \ + " \"from\": %zu," \ + " \"size\": %zu," \ + " \"_source\": [%s]," \ + " \"query\": {" \ + " \"query_string\": {" \ + " \"query\": \"%s\"" \ + " }" \ + " }" \ + "}" + +#define MDSSVC_ELASTIC_SOURCES \ + "\"path.real\"" + +static bool mdssvc_es_init(struct mdssvc_ctx *mdssvc_ctx) +{ + struct mdssvc_es_ctx *mdssvc_es_ctx = NULL; + json_error_t json_error; + char *default_path = NULL; + char *path = NULL; + + mdssvc_es_ctx = talloc_zero(mdssvc_ctx, struct mdssvc_es_ctx); + if (mdssvc_es_ctx == NULL) { + return false; + } + mdssvc_es_ctx->mdssvc_ctx = mdssvc_ctx; + + mdssvc_es_ctx->creds = cli_credentials_init_anon(mdssvc_es_ctx); + if (mdssvc_es_ctx->creds == NULL) { + TALLOC_FREE(mdssvc_es_ctx); + return false; + } + + default_path = talloc_asprintf( + mdssvc_es_ctx, + "%s/mdssvc/elasticsearch_mappings.json", + get_dyn_SAMBA_DATADIR()); + if (default_path == NULL) { + TALLOC_FREE(mdssvc_es_ctx); + return false; + } + + path = lp_parm_talloc_string(mdssvc_es_ctx, + GLOBAL_SECTION_SNUM, + "elasticsearch", + "mappings", + default_path); + TALLOC_FREE(default_path); + if (path == NULL) { + TALLOC_FREE(mdssvc_es_ctx); + return false; + } + + mdssvc_es_ctx->mappings = json_load_file(path, 0, &json_error); + if (mdssvc_es_ctx->mappings == NULL) { + DBG_ERR("Opening mapping file [%s] failed: %s\n", + path, json_error.text); + TALLOC_FREE(path); + TALLOC_FREE(mdssvc_es_ctx); + return false; + } + TALLOC_FREE(path); + + mdssvc_ctx->backend_private = mdssvc_es_ctx; + return true; +} + +static bool mdssvc_es_shutdown(struct mdssvc_ctx *mdssvc_ctx) +{ + return true; +} + +static struct tevent_req *mds_es_connect_send( + TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct mds_es_ctx *mds_es_ctx); +static int mds_es_connect_recv(struct tevent_req *req); +static void mds_es_connected(struct tevent_req *subreq); +static bool mds_es_next_search_trigger(struct mds_es_ctx *mds_es_ctx); + +static bool mds_es_connect(struct mds_ctx *mds_ctx) +{ + struct mdssvc_es_ctx *mdssvc_es_ctx = talloc_get_type_abort( + mds_ctx->mdssvc_ctx->backend_private, struct mdssvc_es_ctx); + struct mds_es_ctx *mds_es_ctx = NULL; + struct tevent_req *subreq = NULL; + + mds_es_ctx = talloc_zero(mds_ctx, struct mds_es_ctx); + if (mds_es_ctx == NULL) { + return false; + } + *mds_es_ctx = (struct mds_es_ctx) { + .mdssvc_es_ctx = mdssvc_es_ctx, + .mds_ctx = mds_ctx, + }; + + mds_ctx->backend_private = mds_es_ctx; + + subreq = mds_es_connect_send( + mds_es_ctx, + mdssvc_es_ctx->mdssvc_ctx->ev_ctx, + mds_es_ctx); + if (subreq == NULL) { + TALLOC_FREE(mds_es_ctx); + return false; + } + tevent_req_set_callback(subreq, mds_es_connected, mds_es_ctx); + return true; +} + +static void mds_es_connected(struct tevent_req *subreq) +{ + struct mds_es_ctx *mds_es_ctx = tevent_req_callback_data( + subreq, struct mds_es_ctx); + int ret; + bool ok; + + ret = mds_es_connect_recv(subreq); + TALLOC_FREE(subreq); + if (ret != 0) { + DBG_ERR("HTTP connect failed\n"); + return; + } + + ok = mds_es_next_search_trigger(mds_es_ctx); + if (!ok) { + DBG_ERR("mds_es_next_search_trigger failed\n"); + } + return; +} + +struct mds_es_connect_state { + struct tevent_context *ev; + struct mds_es_ctx *mds_es_ctx; + struct tevent_queue_entry *qe; + const char *server_addr; + uint16_t server_port; + struct tstream_tls_params *tls_params; +}; + +static void mds_es_http_connect_done(struct tevent_req *subreq); +static void mds_es_http_waited(struct tevent_req *subreq); + +static struct tevent_req *mds_es_connect_send( + TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct mds_es_ctx *mds_es_ctx) +{ + struct tevent_req *req = NULL; + struct tevent_req *subreq = NULL; + struct mds_es_connect_state *state = NULL; + bool use_tls; + NTSTATUS status; + + req = tevent_req_create(mem_ctx, &state, struct mds_es_connect_state); + if (req == NULL) { + return NULL; + } + *state = (struct mds_es_connect_state) { + .ev = ev, + .mds_es_ctx = mds_es_ctx, + }; + + state->server_addr = lp_parm_talloc_string( + state, + mds_es_ctx->mds_ctx->snum, + "elasticsearch", + "address", + "localhost"); + state->server_port = lp_parm_int( + mds_es_ctx->mds_ctx->snum, + "elasticsearch", + "port", + 9200); + + use_tls = lp_parm_bool( + mds_es_ctx->mds_ctx->snum, + "elasticsearch", + "use tls", + false); + + DBG_DEBUG("Connecting to HTTP%s [%s] port [%"PRIu16"]\n", + use_tls ? "S" : "", state->server_addr, state->server_port); + + if (use_tls) { + const char *ca_file = lp__tls_cafile(); + const char *crl_file = lp__tls_crlfile(); + const char *tls_priority = lp_tls_priority(); + enum tls_verify_peer_state verify_peer = lp_tls_verify_peer(); + + status = tstream_tls_params_client(state, + ca_file, + crl_file, + tls_priority, + verify_peer, + state->server_addr, + &state->tls_params); + if (!NT_STATUS_IS_OK(status)) { + DBG_ERR("Failed tstream_tls_params_client - %s\n", + nt_errstr(status)); + tevent_req_nterror(req, status); + return tevent_req_post(req, ev); + } + } + + subreq = http_connect_send(state, + state->ev, + state->server_addr, + state->server_port, + mds_es_ctx->mdssvc_es_ctx->creds, + state->tls_params); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, mds_es_http_connect_done, req); + return req; +} + +static void mds_es_http_connect_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct mds_es_connect_state *state = tevent_req_data( + req, struct mds_es_connect_state); + int error; + + error = http_connect_recv(subreq, + state->mds_es_ctx, + &state->mds_es_ctx->http_conn); + TALLOC_FREE(subreq); + if (error != 0) { + DBG_ERR("HTTP connect failed, retrying...\n"); + + subreq = tevent_wakeup_send( + state->mds_es_ctx, + state->mds_es_ctx->mdssvc_es_ctx->mdssvc_ctx->ev_ctx, + tevent_timeval_current_ofs(10, 0)); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, + mds_es_http_waited, + req); + return; + } + + DBG_DEBUG("Connected to HTTP%s [%s] port [%"PRIu16"]\n", + state->tls_params ? "S" : "", + state->server_addr, state->server_port); + + tevent_req_done(req); + return; +} + +static void mds_es_http_waited(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct mds_es_connect_state *state = tevent_req_data( + req, struct mds_es_connect_state); + bool ok; + + ok = tevent_wakeup_recv(subreq); + TALLOC_FREE(subreq); + if (!ok) { + tevent_req_error(req, ETIMEDOUT); + return; + } + + subreq = mds_es_connect_send( + state->mds_es_ctx, + state->mds_es_ctx->mdssvc_es_ctx->mdssvc_ctx->ev_ctx, + state->mds_es_ctx); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, mds_es_connected, state->mds_es_ctx); +} + +static int mds_es_connect_recv(struct tevent_req *req) +{ + return tevent_req_simple_recv_unix(req); +} + +static void mds_es_reconnect_on_error(struct sl_es_search *s) +{ + struct mds_es_ctx *mds_es_ctx = s->mds_es_ctx; + struct tevent_req *subreq = NULL; + + if (s->slq != NULL) { + s->slq->state = SLQ_STATE_ERROR; + } + + DBG_WARNING("Reconnecting HTTP...\n"); + TALLOC_FREE(mds_es_ctx->http_conn); + + subreq = mds_es_connect_send( + mds_es_ctx, + mds_es_ctx->mdssvc_es_ctx->mdssvc_ctx->ev_ctx, + mds_es_ctx); + if (subreq == NULL) { + DBG_ERR("mds_es_connect_send failed\n"); + return; + } + tevent_req_set_callback(subreq, mds_es_connected, mds_es_ctx); +} + +static int search_destructor(struct sl_es_search *s) +{ + DLIST_REMOVE(s->mds_es_ctx->searches, s); + return 0; +} + +static struct tevent_req *mds_es_search_send(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct sl_es_search *s); +static int mds_es_search_recv(struct tevent_req *req); +static void mds_es_search_done(struct tevent_req *subreq); + +static bool mds_es_search(struct sl_query *slq) +{ + struct mds_es_ctx *mds_es_ctx = talloc_get_type_abort( + slq->mds_ctx->backend_private, struct mds_es_ctx); + struct sl_es_search *s = NULL; + bool ok; + + s = talloc_zero(slq, struct sl_es_search); + if (s == NULL) { + return false; + } + *s = (struct sl_es_search) { + .ev = mds_es_ctx->mdssvc_es_ctx->mdssvc_ctx->ev_ctx, + .mds_es_ctx = mds_es_ctx, + .slq = slq, + .size = MAX_SL_RESULTS, + }; + + /* 0 would mean no limit */ + s->max = lp_parm_ulonglong(s->slq->mds_ctx->snum, + "elasticsearch", + "max results", + MAX_SL_RESULTS); + + DBG_DEBUG("Spotlight query: '%s'\n", slq->query_string); + + ok = map_spotlight_to_es_query( + s, + mds_es_ctx->mdssvc_es_ctx->mappings, + slq->path_scope, + slq->query_string, + &s->es_query); + if (!ok) { + TALLOC_FREE(s); + return false; + } + DBG_DEBUG("Elasticsearch query: '%s'\n", s->es_query); + + slq->backend_private = s; + slq->state = SLQ_STATE_RUNNING; + DLIST_ADD_END(mds_es_ctx->searches, s); + talloc_set_destructor(s, search_destructor); + + return mds_es_next_search_trigger(mds_es_ctx); +} + +static bool mds_es_next_search_trigger(struct mds_es_ctx *mds_es_ctx) +{ + struct tevent_req *subreq = NULL; + struct sl_es_search *s = mds_es_ctx->searches; + + if (mds_es_ctx->http_conn == NULL) { + DBG_DEBUG("Waiting for HTTP connection...\n"); + return true; + } + if (s == NULL) { + DBG_DEBUG("No pending searches, idling...\n"); + return true; + } + if (s->pending) { + DBG_DEBUG("Search pending [%p]\n", s); + return true; + } + + subreq = mds_es_search_send(s, s->ev, s); + if (subreq == NULL) { + return false; + } + tevent_req_set_callback(subreq, mds_es_search_done, s); + return true; +} + +static void mds_es_search_done(struct tevent_req *subreq) +{ + struct sl_es_search *s = tevent_req_callback_data( + subreq, struct sl_es_search); + struct mds_es_ctx *mds_es_ctx = s->mds_es_ctx; + struct sl_query *slq = s->slq; + int ret; + bool ok; + + DBG_DEBUG("Search done for search [%p]\n", s); + + DLIST_REMOVE(mds_es_ctx->searches, s); + + ret = mds_es_search_recv(subreq); + TALLOC_FREE(subreq); + if (ret != 0) { + mds_es_reconnect_on_error(s); + return; + } + + if (slq == NULL) { + /* + * Closed by the user. This is the only place where we free "s" + * explicitly because the talloc parent slq is already gone. + * Everywhere else we rely on the destructor of slq to free s"." + */ + TALLOC_FREE(s); + goto trigger; + } + + SLQ_DEBUG(10, slq, "search done"); + + if (s->total == 0 || s->from >= s->max) { + slq->state = SLQ_STATE_DONE; + goto trigger; + } + + if (slq->query_results->num_results >= MAX_SL_RESULTS) { + slq->state = SLQ_STATE_FULL; + goto trigger; + } + + /* + * Reschedule this query as there are more results waiting in the + * Elasticsearch server and the client result queue has room as + * well. But put it at the end of the list of active queries as a simple + * heuristic that should ensure all client queries are dispatched to the + * server. + */ + DLIST_ADD_END(mds_es_ctx->searches, s); + +trigger: + ok = mds_es_next_search_trigger(mds_es_ctx); + if (!ok) { + DBG_ERR("mds_es_next_search_trigger failed\n"); + } +} + +static void mds_es_search_http_send_done(struct tevent_req *subreq); +static void mds_es_search_http_read_done(struct tevent_req *subreq); + +struct mds_es_search_state { + struct tevent_context *ev; + struct sl_es_search *s; + struct tevent_queue_entry *qe; + struct http_request http_request; + struct http_request *http_response; +}; + +static int mds_es_search_pending_destructor(struct sl_es_search *s) +{ + /* + * s is a child of slq which may get freed when a user closes a + * query. To maintain the HTTP request/response sequence on the HTTP + * channel, we keep processing pending requests and free s when we + * receive the HTTP response for pending requests. + */ + DBG_DEBUG("Preserving pending search [%p]\n", s); + s->slq = NULL; + return -1; +} + +static void mds_es_search_set_pending(struct sl_es_search *s) +{ + DBG_DEBUG("Set pending [%p]\n", s); + SLQ_DEBUG(10, s->slq, "pending"); + + s->pending = true; + talloc_set_destructor(s, mds_es_search_pending_destructor); +} + +static void mds_es_search_unset_pending(struct sl_es_search *s) +{ + DBG_DEBUG("Unset pending [%p]\n", s); + if (s->slq != NULL) { + SLQ_DEBUG(10, s->slq, "unset pending"); + } + + s->pending = false; + talloc_set_destructor(s, NULL); +} + +static struct tevent_req *mds_es_search_send(TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct sl_es_search *s) +{ + struct tevent_req *req = NULL; + struct tevent_req *subreq = NULL; + struct mds_es_search_state *state = NULL; + const char *index = NULL; + char *elastic_query = NULL; + char *uri = NULL; + size_t elastic_query_len; + char *elastic_query_len_str = NULL; + char *hostname = NULL; + bool pretty = false; + + req = tevent_req_create(mem_ctx, &state, struct mds_es_search_state); + if (req == NULL) { + return NULL; + } + *state = (struct mds_es_search_state) { + .ev = ev, + .s = s, + }; + + if (!tevent_req_set_endtime(req, ev, timeval_current_ofs(60, 0))) { + return tevent_req_post(req, s->ev); + } + + index = lp_parm_const_string(s->slq->mds_ctx->snum, + "elasticsearch", + "index", + "_all"); + if (tevent_req_nomem(index, req)) { + return tevent_req_post(req, ev); + } + + if (DEBUGLVL(10)) { + pretty = true; + } + + uri = talloc_asprintf(state, + "/%s/_search%s", + index, + pretty ? "?pretty" : ""); + if (tevent_req_nomem(uri, req)) { + return tevent_req_post(req, ev); + } + + elastic_query = talloc_asprintf(state, + MDSSVC_ELASTIC_QUERY_TEMPLATE, + s->from, + s->size, + MDSSVC_ELASTIC_SOURCES, + s->es_query); + if (tevent_req_nomem(elastic_query, req)) { + return tevent_req_post(req, ev); + } + DBG_DEBUG("Elastic query: '%s'\n", elastic_query); + + elastic_query_len = strlen(elastic_query); + + state->http_request = (struct http_request) { + .type = HTTP_REQ_POST, + .uri = uri, + .body = data_blob_const(elastic_query, elastic_query_len), + .major = '1', + .minor = '1', + }; + + elastic_query_len_str = talloc_asprintf(state, "%zu", elastic_query_len); + if (tevent_req_nomem(elastic_query_len_str, req)) { + return tevent_req_post(req, ev); + } + + hostname = get_myname(state); + if (tevent_req_nomem(hostname, req)) { + return tevent_req_post(req, ev); + } + + http_add_header(state, &state->http_request.headers, + "Content-Type", "application/json"); + http_add_header(state, &state->http_request.headers, + "Accept", "application/json"); + http_add_header(state, &state->http_request.headers, + "User-Agent", "Samba/mdssvc"); + http_add_header(state, &state->http_request.headers, + "Host", hostname); + http_add_header(state, &state->http_request.headers, + "Content-Length", elastic_query_len_str); + + subreq = http_send_request_send(state, + ev, + s->mds_es_ctx->http_conn, + &state->http_request); + if (tevent_req_nomem(subreq, req)) { + return tevent_req_post(req, ev); + } + mds_es_search_set_pending(s); + tevent_req_set_callback(subreq, mds_es_search_http_send_done, req); + return req; +} + +static void mds_es_search_http_send_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct mds_es_search_state *state = tevent_req_data( + req, struct mds_es_search_state); + NTSTATUS status; + + DBG_DEBUG("Sent out search [%p]\n", state->s); + + status = http_send_request_recv(subreq); + TALLOC_FREE(subreq); + if (!NT_STATUS_IS_OK(status)) { + tevent_req_error(req, map_errno_from_nt_status(status)); + return; + } + + if (state->s->mds_es_ctx->mds_ctx == NULL) { + mds_es_search_unset_pending(state->s); + tevent_req_error(req, ECANCELED); + return; + } + + subreq = http_read_response_send(state, + state->ev, + state->s->mds_es_ctx->http_conn, + MAX_SL_RESULTS * 8192); + if (tevent_req_nomem(subreq, req)) { + return; + } + tevent_req_set_callback(subreq, mds_es_search_http_read_done, req); +} + +static void mds_es_search_http_read_done(struct tevent_req *subreq) +{ + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct mds_es_search_state *state = tevent_req_data( + req, struct mds_es_search_state); + struct sl_es_search *s = state->s; + struct sl_query *slq = s->slq; + json_t *root = NULL; + json_t *matches = NULL; + json_t *match = NULL; + size_t i; + json_error_t error; + int hits; + NTSTATUS status; + int ret; + bool ok; + + DBG_DEBUG("Got response for search [%p]\n", s); + + mds_es_search_unset_pending(s); + + status = http_read_response_recv(subreq, state, &state->http_response); + TALLOC_FREE(subreq); + if (!NT_STATUS_IS_OK(status)) { + DBG_DEBUG("HTTP response failed: %s\n", nt_errstr(status)); + tevent_req_error(req, map_errno_from_nt_status(status)); + return; + } + + if (slq == NULL) { + tevent_req_done(req); + return; + } + if (s->mds_es_ctx->mds_ctx == NULL) { + tevent_req_error(req, ECANCELED); + return; + } + + switch (state->http_response->response_code) { + case 200: + break; + default: + DBG_ERR("HTTP server response: %u\n", + state->http_response->response_code); + goto fail; + } + + DBG_DEBUG("JSON response:\n%s\n", + talloc_strndup(talloc_tos(), + (char *)state->http_response->body.data, + state->http_response->body.length)); + + root = json_loadb((char *)state->http_response->body.data, + state->http_response->body.length, + 0, + &error); + if (root == NULL) { + DBG_ERR("json_loadb failed\n"); + goto fail; + } + + if (s->total == 0) { + /* + * Get the total number of results the first time, format + * used by Elasticsearch 7.0 or newer + */ + ret = json_unpack(root, "{s: {s: {s: i}}}", + "hits", "total", "value", &s->total); + if (ret != 0) { + /* Format used before 7.0 */ + ret = json_unpack(root, "{s: {s: i}}", + "hits", "total", &s->total); + if (ret != 0) { + DBG_ERR("json_unpack failed\n"); + goto fail; + } + } + + DBG_DEBUG("Total: %zu\n", s->total); + + if (s->total == 0) { + json_decref(root); + tevent_req_done(req); + return; + } + } + + if (s->max == 0 || s->max > s->total) { + s->max = s->total; + } + + ret = json_unpack(root, "{s: {s:o}}", + "hits", "hits", &matches); + if (ret != 0 || matches == NULL) { + DBG_ERR("json_unpack hits failed\n"); + goto fail; + } + + hits = json_array_size(matches); + if (hits == 0) { + DBG_ERR("Hu?! No results?\n"); + goto fail; + } + DBG_DEBUG("Hits: %d\n", hits); + + for (i = 0; i < hits; i++) { + const char *path = NULL; + + match = json_array_get(matches, i); + if (match == NULL) { + DBG_ERR("Hu?! No value for index %zu\n", i); + goto fail; + } + ret = json_unpack(match, + "{s: {s: {s: s}}}", + "_source", + "path", + "real", + &path); + if (ret != 0) { + DBG_ERR("Missing path.real in JSON result\n"); + goto fail; + } + + ok = mds_add_result(slq, path); + if (!ok) { + DBG_ERR("error adding result for path: %s\n", path); + goto fail; + } + } + json_decref(root); + + s->from += hits; + slq->state = SLQ_STATE_RESULTS; + tevent_req_done(req); + return; + +fail: + if (root != NULL) { + json_decref(root); + } + slq->state = SLQ_STATE_ERROR; + tevent_req_error(req, EINVAL); + return; +} + +static int mds_es_search_recv(struct tevent_req *req) +{ + return tevent_req_simple_recv_unix(req); +} + +static bool mds_es_search_cont(struct sl_query *slq) +{ + struct sl_es_search *s = talloc_get_type_abort( + slq->backend_private, struct sl_es_search); + + SLQ_DEBUG(10, slq, "continue"); + DLIST_ADD_END(s->mds_es_ctx->searches, s); + return mds_es_next_search_trigger(s->mds_es_ctx); +} + +struct mdssvc_backend mdsscv_backend_es = { + .init = mdssvc_es_init, + .shutdown = mdssvc_es_shutdown, + .connect = mds_es_connect, + .search_start = mds_es_search, + .search_cont = mds_es_search_cont, +}; diff --git a/source3/rpc_server/mdssvc/mdssvc_es.h b/source3/rpc_server/mdssvc/mdssvc_es.h new file mode 100644 index 00000000000..19797fa24f3 --- /dev/null +++ b/source3/rpc_server/mdssvc/mdssvc_es.h @@ -0,0 +1,108 @@ +/* + Unix SMB/CIFS implementation. + Main metadata server / Spotlight routines / HTTP/ES/JSON backend + + Copyright (C) Ralph Boehme 2019 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef _MDSSVC_ES_H_ +#define _MDSSVC_ES_H_ + +#include + +/* + * Some global state + */ +struct mdssvc_es_ctx { + struct mdssvc_ctx *mdssvc_ctx; + struct cli_credentials *creds; + json_t *mappings; +}; + +/* + * Per mdssvc RPC bind state + */ +struct mds_es_ctx { + /* + * Pointer to higher level mds_ctx + */ + struct mds_ctx *mds_ctx; + + /* + * Pointer to our global context + */ + struct mdssvc_es_ctx *mdssvc_es_ctx; + + /* + * The HTTP connection handle to the ES server + */ + struct http_conn *http_conn; + + /* + * List of pending searches + */ + struct sl_es_search *searches; +}; + +/* Per search request */ +struct sl_es_search { + /* + * List pointers + */ + struct sl_es_search *prev, *next; + + /* + * Search is being executed. Only the list head can be pending. + */ + bool pending; + + /* + * Shorthand to our tevent context + */ + struct tevent_context *ev; + + /* + * Pointer to the RPC connection ctx the request is using + */ + struct mds_es_ctx *mds_es_ctx; + + /* + * The upper mdssvc.c level query context + */ + struct sl_query *slq; + + /* + * Maximum number of results we process and total number of + * results of a query. + */ + size_t total; + size_t max; + + /* + * For paging results + */ + size_t from; + size_t size; + + /* + * The translated Es query + */ + char *es_query; +}; + +extern struct mdssvc_backend mdsscv_backend_es; + +#endif /* _MDSSVC_ES_H_ */ diff --git a/source3/rpc_server/wscript_build b/source3/rpc_server/wscript_build index 429c51b55cd..870eb3d39d6 100644 --- a/source3/rpc_server/wscript_build +++ b/source3/rpc_server/wscript_build @@ -161,6 +161,18 @@ if bld.env.spotlight_backend_tracker: ''' rpc_mdssvc_deps += 'tevent-glib-glue ' + bld.env['libtracker'] +if bld.env.spotlight_backend_es: + rpc_mdssvc_sources += ''' + mdssvc/mdssvc_es.c + mdssvc/es_mapping.c + mdssvc/es_parser.y + mdssvc/es_lexer.l + ''' + rpc_mdssvc_deps += ' http jansson' + if bld.SAMBA3_IS_ENABLED_MODULE('rpc_mdssvc_module'): + bld.INSTALL_FILES(bld.env.SAMBA_DATADIR, + 'mdssvc/elasticsearch_mappings.json') + bld.SAMBA3_MODULE('rpc_mdssvc_module', subsystem='rpc', allow_undefined_symbols=True, diff --git a/source3/wscript b/source3/wscript index e29bf657b5e..a6092b87615 100644 --- a/source3/wscript +++ b/source3/wscript @@ -1789,6 +1789,13 @@ main() { and conf.CONFIG_GET('HAVE_UTF8_NORMALISATION') ) + with_spotlight_es_backend = ( + conf.CONFIG_SET('HAVE_JSON_OBJECT') + and conf.env['BISON'] + and conf.env['FLEX'] + and conf.CONFIG_GET('HAVE_UTF8_NORMALISATION') + ) + conf.env.with_spotlight = False if Options.options.with_spotlight is not False: backends = ['noindex'] @@ -1804,14 +1811,23 @@ main() { Logs.warn('Missing libtracker-sparql development files for Spotlight backend "tracker"') if not conf.CONFIG_SET('HAVE_GLIB'): Logs.warn('Missing glib-2.0 development files for Spotlight backend "tracker"') + if not conf.CONFIG_GET('HAVE_JSON_OBJECT'): + Logs.warn('Missing libjansson development files for Spotlight backend "elasticsearch"') if with_spotlight_tracker_backend: conf.env.spotlight_backend_tracker = True backends.append('tracker') conf.DEFINE('HAVE_SPOTLIGHT_BACKEND_TRACKER', '1') - if Options.options.with_spotlight is True and not conf.env.spotlight_backend_tracker: - conf.fatal("Unmet dependencies for Spotlight backend") + if with_spotlight_es_backend: + conf.env.spotlight_backend_es = True + backends.append('elasticsearch') + conf.DEFINE('HAVE_SPOTLIGHT_BACKEND_ES', '1') + + if (Options.options.with_spotlight is True + and not conf.env.spotlight_backend_tracker + and not conf.env.spotlight_backend_es): + conf.fatal("Unmet dependencies for Spotlight backends") Logs.info("Building with Spotlight support, available backends: %s" % ', '.join(backends)) default_static_modules.extend(TO_LIST('rpc_mdssvc_module')) diff --git a/source3/wscript_build b/source3/wscript_build index d49512e59e7..7d44e843c49 100644 --- a/source3/wscript_build +++ b/source3/wscript_build @@ -1335,6 +1335,16 @@ bld.SAMBA3_BINARY('spotlight2sparql', enabled=bld.env.spotlight_backend_tracker, install=False) +bld.SAMBA3_BINARY('spotlight2es', + source=''' + rpc_server/mdssvc/es_parser_test.c + rpc_server/mdssvc/es_parser.y + rpc_server/mdssvc/es_lexer.l + rpc_server/mdssvc/es_mapping.c''', + deps='samba3-util talloc jansson smbconf', + enabled=bld.env.spotlight_backend_es, + install=False) + bld.SAMBA3_BINARY('tevent_glib_glue_test', source='lib/tevent_glib_glue_tests.c', deps='''