From ce3d26911e04a98dc8d45486b67d30baf6a5794a Mon Sep 17 00:00:00 2001 From: Sebastian Date: Fri, 16 Oct 2020 20:36:05 +0200 Subject: [PATCH] Elastic search improvements. --- .../Text/Elastic/ElasticSearchMapping.cs | 225 +++++++++++++++++ .../Text/Elastic/ElasticSearchTextIndex.cs | 231 ++---------------- .../Contents/Text/Extensions.cs | 24 +- .../Contents/Text/TextIndexerTestsBase.cs | 17 +- .../Contents/Text/TextIndexerTests_Elastic.cs | 7 +- .../Contents/Text/TextIndexerTests_Mongo.cs | 2 +- 6 files changed, 257 insertions(+), 249 deletions(-) create mode 100644 backend/src/Squidex.Domain.Apps.Entities/Contents/Text/Elastic/ElasticSearchMapping.cs diff --git a/backend/src/Squidex.Domain.Apps.Entities/Contents/Text/Elastic/ElasticSearchMapping.cs b/backend/src/Squidex.Domain.Apps.Entities/Contents/Text/Elastic/ElasticSearchMapping.cs new file mode 100644 index 000000000..f0fabc326 --- /dev/null +++ b/backend/src/Squidex.Domain.Apps.Entities/Contents/Text/Elastic/ElasticSearchMapping.cs @@ -0,0 +1,225 @@ +// ========================================================================== +// Squidex Headless CMS +// ========================================================================== +// Copyright (c) Squidex UG (haftungsbeschraenkt) +// All rights reserved. Licensed under the MIT license. +// ========================================================================== + +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using Elasticsearch.Net; + +namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic +{ + public static class ElasticSearchMapping + { + public static async Task ApplyAsync(IElasticLowLevelClient elastic, string indexName, CancellationToken ct = default) + { + var query = new + { + properties = new Dictionary + { + ["texts.ar"] = new + { + type = "text", + analyzer = "arabic" + }, + ["texts.hy"] = new + { + type = "text", + analyzer = "armenian" + }, + ["texts.eu"] = new + { + type = "text", + analyzer = "basque" + }, + ["texts.bn"] = new + { + type = "text", + analyzer = "bengali" + }, + ["texts.br"] = new + { + type = "text", + analyzer = "brazilian" + }, + ["texts.bg"] = new + { + type = "text", + analyzer = "bulgarian" + }, + ["texts.ca"] = new + { + type = "text", + analyzer = "catalan" + }, + ["texts.zh"] = new + { + type = "text", + analyzer = "cjk" + }, + ["texts.ja"] = new + { + type = "text", + analyzer = "cjk" + }, + ["texts.ko"] = new + { + type = "text", + analyzer = "cjk" + }, + ["texts.cs"] = new + { + type = "text", + analyzer = "czech" + }, + ["texts.da"] = new + { + type = "text", + analyzer = "danish" + }, + ["texts.nl"] = new + { + type = "text", + analyzer = "dutch" + }, + ["texts.en"] = new + { + type = "text", + analyzer = "english" + }, + ["texts.fi"] = new + { + type = "text", + analyzer = "finnish" + }, + ["texts.fr"] = new + { + type = "text", + analyzer = "french" + }, + ["texts.gl"] = new + { + type = "text", + analyzer = "galician" + }, + ["texts.de"] = new + { + type = "text", + analyzer = "german" + }, + ["texts.el"] = new + { + type = "text", + analyzer = "greek" + }, + ["texts.hi"] = new + { + type = "text", + analyzer = "hindi" + }, + ["texts.hu"] = new + { + type = "text", + analyzer = "hungarian" + }, + ["texts.id"] = new + { + type = "text", + analyzer = "indonesian" + }, + ["texts.ga"] = new + { + type = "text", + analyzer = "irish" + }, + ["texts.it"] = new + { + type = "text", + analyzer = "italian" + }, + ["texts.lv"] = new + { + type = "text", + analyzer = "latvian" + }, + ["texts.lt"] = new + { + type = "text", + analyzer = "lithuanian" + }, + ["texts.nb"] = new + { + type = "text", + analyzer = "norwegian" + }, + ["texts.nn"] = new + { + type = "text", + analyzer = "norwegian" + }, + ["texts.no"] = new + { + type = "text", + analyzer = "norwegian" + }, + ["texts.pt"] = new + { + type = "text", + analyzer = "portuguese" + }, + ["texts.ro"] = new + { + type = "text", + analyzer = "romanian" + }, + ["texts.ru"] = new + { + type = "text", + analyzer = "russian" + }, + ["texts.ku"] = new + { + type = "text", + analyzer = "sorani" + }, + ["texts.es"] = new + { + type = "text", + analyzer = "spanish" + }, + ["texts.sv"] = new + { + type = "text", + analyzer = "swedish" + }, + ["texts.tr"] = new + { + type = "text", + analyzer = "turkish" + }, + ["texts.th"] = new + { + type = "text", + analyzer = "thai" + } + } + }; + + var result = await elastic.Indices.PutMappingAsync(indexName, CreatePost(query), ctx: ct); + + if (!result.Success) + { + throw new InvalidOperationException($"Failed with ${result.Body}", result.OriginalException); + } + } + + private static PostData CreatePost(T data) + { + return new SerializableData(data); + } + } +} diff --git a/backend/src/Squidex.Domain.Apps.Entities/Contents/Text/Elastic/ElasticSearchTextIndex.cs b/backend/src/Squidex.Domain.Apps.Entities/Contents/Text/Elastic/ElasticSearchTextIndex.cs index 8774b3761..5ebc40ba3 100644 --- a/backend/src/Squidex.Domain.Apps.Entities/Contents/Text/Elastic/ElasticSearchTextIndex.cs +++ b/backend/src/Squidex.Domain.Apps.Entities/Contents/Text/Elastic/ElasticSearchTextIndex.cs @@ -38,206 +38,9 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic this.waitForTesting = waitForTesting; } - public async Task InitializeAsync(CancellationToken ct = default) + public Task InitializeAsync(CancellationToken ct = default) { - var query = new - { - properties = new Dictionary - { - ["texts.ar"] = new - { - type = "text", - analyzer = "arabic" - }, - ["texts.hy"] = new - { - type = "text", - analyzer = "armenian" - }, - ["texts.eu"] = new - { - type = "text", - analyzer = "basque" - }, - ["texts.bn"] = new - { - type = "text", - analyzer = "bengali" - }, - ["texts.br"] = new - { - type = "text", - analyzer = "brazilian" - }, - ["texts.bg"] = new - { - type = "text", - analyzer = "bulgarian" - }, - ["texts.ca"] = new - { - type = "text", - analyzer = "catalan" - }, - ["texts.zh"] = new - { - type = "text", - analyzer = "cjk" - }, - ["texts.ja"] = new - { - type = "text", - analyzer = "cjk" - }, - ["texts.ko"] = new - { - type = "text", - analyzer = "cjk" - }, - ["texts.cs"] = new - { - type = "text", - analyzer = "czech" - }, - ["texts.da"] = new - { - type = "text", - analyzer = "danish" - }, - ["texts.nl"] = new - { - type = "text", - analyzer = "dutch" - }, - ["texts.en"] = new - { - type = "text", - analyzer = "english" - }, - ["texts.fi"] = new - { - type = "text", - analyzer = "finnish" - }, - ["texts.fr"] = new - { - type = "text", - analyzer = "french" - }, - ["texts.gl"] = new - { - type = "text", - analyzer = "galician" - }, - ["texts.de"] = new - { - type = "text", - analyzer = "german" - }, - ["texts.el"] = new - { - type = "text", - analyzer = "greek" - }, - ["texts.hi"] = new - { - type = "text", - analyzer = "hindi" - }, - ["texts.hu"] = new - { - type = "text", - analyzer = "hungarian" - }, - ["texts.id"] = new - { - type = "text", - analyzer = "indonesian" - }, - ["texts.ga"] = new - { - type = "text", - analyzer = "irish" - }, - ["texts.it"] = new - { - type = "text", - analyzer = "italian" - }, - ["texts.lv"] = new - { - type = "text", - analyzer = "latvian" - }, - ["texts.lt"] = new - { - type = "text", - analyzer = "lithuanian" - }, - ["texts.nb"] = new - { - type = "text", - analyzer = "norwegian" - }, - ["texts.nn"] = new - { - type = "text", - analyzer = "norwegian" - }, - ["texts.no"] = new - { - type = "text", - analyzer = "norwegian" - }, - ["texts.pt"] = new - { - type = "text", - analyzer = "portuguese" - }, - ["texts.ro"] = new - { - type = "text", - analyzer = "romanian" - }, - ["texts.ru"] = new - { - type = "text", - analyzer = "russian" - }, - ["texts.ku"] = new - { - type = "text", - analyzer = "sorani" - }, - ["texts.es"] = new - { - type = "text", - analyzer = "spanish" - }, - ["texts.sv"] = new - { - type = "text", - analyzer = "swedish" - }, - ["texts.tr"] = new - { - type = "text", - analyzer = "turkish" - }, - ["texts.th"] = new - { - type = "text", - analyzer = "thai" - } - } - }; - - var result = await client.Indices.PutMappingAsync(indexName, CreatePost(query)); - - if (!result.Success) - { - throw new InvalidOperationException($"Failed with ${result.Body}", result.OriginalException); - } + return ElasticSearchMapping.ApplyAsync(client, indexName, ct); } public Task ClearAsync() @@ -315,11 +118,6 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic return client.DeleteAsync(indexName, delete.DocId); } - private static PostData CreatePost(T data) - { - return new SerializableData(data); - } - public async Task?> SearchAsync(string? queryText, IAppEntity app, SearchFilter? filter, SearchScope scope) { if (string.IsNullOrWhiteSpace(queryText)) @@ -327,11 +125,25 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic return new List(); } - var isFuzzy = queryText.StartsWith("~", StringComparison.OrdinalIgnoreCase); + var isFuzzy = queryText.EndsWith("~", StringComparison.OrdinalIgnoreCase); if (isFuzzy) { - queryText = queryText.Substring(1); + queryText = queryText[..^1]; + } + + var field = "texts.*"; + + if (queryText.Length >= 4 && queryText.IndexOf(":", StringComparison.OrdinalIgnoreCase) == 2) + { + var candidateLanguage = queryText.Substring(0, 2); + + if (Language.IsValidLanguage(candidateLanguage)) + { + field = $"texts.{candidateLanguage}"; + + queryText = queryText.Substring(3); + } } var serveField = GetServeField(scope); @@ -365,7 +177,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic fuzziness = isFuzzy ? (object)"AUTO" : 0, fields = new[] { - "texts.*" + field }, query = queryText } @@ -427,5 +239,10 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic "servePublished" : "serveAll"; } + + private static PostData CreatePost(T data) + { + return new SerializableData(data); + } } } diff --git a/backend/src/Squidex.Domain.Apps.Entities/Contents/Text/Extensions.cs b/backend/src/Squidex.Domain.Apps.Entities/Contents/Text/Extensions.cs index a457ebfd5..4adc35151 100644 --- a/backend/src/Squidex.Domain.Apps.Entities/Contents/Text/Extensions.cs +++ b/backend/src/Squidex.Domain.Apps.Entities/Contents/Text/Extensions.cs @@ -6,7 +6,6 @@ // ========================================================================== using System.Collections.Generic; -using System.Globalization; using System.Text; using Microsoft.Extensions.ObjectPool; using Squidex.Domain.Apps.Core.Contents; @@ -94,29 +93,8 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text sb.Append(" "); } - foreach (var c in text) - { - if (IsCJKLetter(c)) - { - sb.Append(c); - sb.Append(" "); - } - else - { - sb.Append(c); - } - } + sb.Append(text); } } - - private static bool IsCJKLetter(char c) - { - return char.IsLetter(c) && char.GetUnicodeCategory(c) == UnicodeCategory.OtherLetter && !IsKatakana(c); - } - - private static bool IsKatakana(char c) - { - return c >= '\u30A0' && c <= '\u30FF'; - } } } diff --git a/backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTestsBase.cs b/backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTestsBase.cs index 4ed1cc81d..f7cbc38b3 100644 --- a/backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTestsBase.cs +++ b/backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTestsBase.cs @@ -41,7 +41,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text public virtual bool SupportsCleanup { get; set; } = false; - public virtual bool SupportsSearchSyntax { get; set; } = true; + public virtual bool SupportssQuerySyntax { get; set; } = true; public virtual InMemoryTextIndexerState State { get; } = new InMemoryTextIndexerState(); @@ -53,21 +53,10 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text Language.EN); } - [SkippableFact] - public async Task Should_throw_exception_for_invalid_query() - { - Skip.IfNot(SupportsSearchSyntax); - - await Assert.ThrowsAsync(async () => - { - await TestCombinations(Search(expected: null, text: "~hello")); - }); - } - [SkippableFact] public async Task Should_index_invariant_content_and_retrieve_with_fuzzy() { - Skip.IfNot(SupportsSearchSyntax); + Skip.IfNot(SupportssQuerySyntax); await TestCombinations( Create(ids1[0], "iv", "Hello"), @@ -81,7 +70,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text [SkippableFact] public async Task Should_search_by_field() { - Skip.IfNot(SupportsSearchSyntax); + Skip.IfNot(SupportssQuerySyntax); await TestCombinations( Create(ids1[0], "en", "City"), diff --git a/backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTests_Elastic.cs b/backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTests_Elastic.cs index cf1781d09..2d18ca660 100644 --- a/backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTests_Elastic.cs +++ b/backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTests_Elastic.cs @@ -38,7 +38,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text public TextIndexerTests_Elastic() { - SupportsSearchSyntax = false; + SupportssQuerySyntax = true; } [Fact] @@ -57,10 +57,9 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text public async Task Should_index_cjk_content_and_retrieve() { await TestCombinations( - Create(ids1[0], "zh", "可以将正向最大匹配方法和"), + Create(ids1[0], "zh", "東京大学"), - Search(expected: ids1, text: "大"), - Search(expected: ids1, text: "匹") + Search(expected: ids1, text: "東京") ); } } diff --git a/backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTests_Mongo.cs b/backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTests_Mongo.cs index 2617a2adc..48e916486 100644 --- a/backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTests_Mongo.cs +++ b/backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTests_Mongo.cs @@ -44,7 +44,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text public TextIndexerTests_Mongo() { - SupportsSearchSyntax = false; + SupportssQuerySyntax = false; } [Fact]