Browse Source

Improvements to full text

pull/590/head
Sebastian 5 years ago
parent
commit
a958c1bf02
  1. 4
      backend/src/Squidex.Domain.Apps.Entities.MongoDb/FullText/MongoTextIndex.cs
  2. 2
      backend/src/Squidex.Domain.Apps.Entities.MongoDb/FullText/MongoTextIndexEntityText.cs
  3. 225
      backend/src/Squidex.Domain.Apps.Entities/Contents/Text/Elastic/ElasticSearchTextIndex.cs
  4. 24
      backend/src/Squidex.Domain.Apps.Entities/Contents/Text/Extensions.cs
  5. 2
      backend/src/Squidex.Domain.Apps.Entities/Contents/Text/TextIndexingProcess.cs
  6. 90
      backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTestsBase.cs
  7. 32
      backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTests_Elastic.cs
  8. 18
      backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTests_Mongo.cs

4
backend/src/Squidex.Domain.Apps.Entities.MongoDb/FullText/MongoTextIndex.cs

@ -131,7 +131,7 @@ namespace Squidex.Domain.Apps.Entities.MongoDb.FullText
Filter.Eq(x => x.AppId, app.Id), Filter.Eq(x => x.AppId, app.Id),
Filter.In(x => x.SchemaId, filter.SchemaIds), Filter.In(x => x.SchemaId, filter.SchemaIds),
Filter_ByScope(scope), Filter_ByScope(scope),
Filter.Text(queryText))) Filter.Text(queryText, "none")))
.Only(x => x.ContentId).Limit(limit) .Only(x => x.ContentId).Limit(limit)
.ToListAsync(); .ToListAsync();
@ -146,7 +146,7 @@ namespace Squidex.Domain.Apps.Entities.MongoDb.FullText
Filter.Eq(x => x.AppId, app.Id), Filter.Eq(x => x.AppId, app.Id),
Filter.Exists(x => x.SchemaId), Filter.Exists(x => x.SchemaId),
Filter_ByScope(scope), Filter_ByScope(scope),
Filter.Text(queryText))) Filter.Text(queryText, "none")))
.Only(x => x.ContentId).Limit(limit) .Only(x => x.ContentId).Limit(limit)
.ToListAsync(); .ToListAsync();

2
backend/src/Squidex.Domain.Apps.Entities.MongoDb/FullText/MongoTextIndexEntityText.cs

@ -17,6 +17,6 @@ namespace Squidex.Domain.Apps.Entities.MongoDb.FullText
[BsonIgnoreIfNull] [BsonIgnoreIfNull]
[BsonElement("language")] [BsonElement("language")]
public string Language { get; set; } public string Language { get; set; } = "none";
} }
} }

225
backend/src/Squidex.Domain.Apps.Entities/Contents/Text/Elastic/ElasticSearchTextIndex.cs

@ -8,6 +8,8 @@
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis; using System.Diagnostics.CodeAnalysis;
using System.Linq;
using System.Threading;
using System.Threading.Tasks; using System.Threading.Tasks;
using Elasticsearch.Net; using Elasticsearch.Net;
using Squidex.Domain.Apps.Entities.Apps; using Squidex.Domain.Apps.Entities.Apps;
@ -16,7 +18,7 @@ using Squidex.Infrastructure;
namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic
{ {
[ExcludeFromCodeCoverage] [ExcludeFromCodeCoverage]
public sealed class ElasticSearchTextIndex : ITextIndex public sealed class ElasticSearchTextIndex : ITextIndex, IInitializable
{ {
private readonly ElasticLowLevelClient client; private readonly ElasticLowLevelClient client;
private readonly string indexName; private readonly string indexName;
@ -36,6 +38,208 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic
this.waitForTesting = waitForTesting; this.waitForTesting = waitForTesting;
} }
/// <summary>
/// Puts a mapping on the index that assigns a language analyzer to each known "texts.{language}" field.
/// </summary>
/// <param name="ct">The cancellation token. It is currently not used by this method.</param>
/// <exception cref="InvalidOperationException">The put mapping request was not successful.</exception>
public async Task InitializeAsync(CancellationToken ct = default)
{
    // Language code (suffix of the "texts.*" field) => analyzer name used for this field.
    // NOTE(review): "br" is the ISO 639-1 code for Breton - confirm it is the intended key for "brazilian".
    var analyzers = new Dictionary<string, string>
    {
        ["ar"] = "arabic",
        ["hy"] = "armenian",
        ["eu"] = "basque",
        ["bn"] = "bengali",
        ["br"] = "brazilian",
        ["bg"] = "bulgarian",
        ["ca"] = "catalan",
        ["zh"] = "cjk",
        ["ja"] = "cjk",
        ["ko"] = "cjk",
        ["cs"] = "czech",
        ["da"] = "danish",
        ["nl"] = "dutch",
        ["en"] = "english",
        ["fi"] = "finnish",
        ["fr"] = "french",
        ["gl"] = "galician",
        ["de"] = "german",
        ["el"] = "greek",
        ["hi"] = "hindi",
        ["hu"] = "hungarian",
        ["id"] = "indonesian",
        ["ga"] = "irish",
        ["it"] = "italian",
        ["lv"] = "latvian",
        ["lt"] = "lithuanian",
        ["nb"] = "norwegian",
        ["nn"] = "norwegian",
        ["no"] = "norwegian",
        ["pt"] = "portuguese",
        ["ro"] = "romanian",
        ["ru"] = "russian",
        ["ku"] = "sorani",
        ["es"] = "spanish",
        ["sv"] = "swedish",
        ["tr"] = "turkish",
        ["th"] = "thai"
    };

    var properties = new Dictionary<string, object>();

    foreach (var pair in analyzers)
    {
        properties[$"texts.{pair.Key}"] = new
        {
            type = "text",
            analyzer = pair.Value
        };
    }

    var query = new
    {
        properties = properties
    };

    // NOTE(review): ct is not forwarded to the request - confirm whether the client call accepts a token.
    var result = await client.Indices.PutMappingAsync<StringResponse>(indexName, CreatePost(query));

    if (!result.Success)
    {
        throw new InvalidOperationException($"Failed with {result.Body}", result.OriginalException);
    }
}
public Task ClearAsync() public Task ClearAsync()
{ {
return Task.CompletedTask; return Task.CompletedTask;
@ -118,6 +322,18 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic
public async Task<List<DomainId>?> SearchAsync(string? queryText, IAppEntity app, SearchFilter? filter, SearchScope scope) public async Task<List<DomainId>?> SearchAsync(string? queryText, IAppEntity app, SearchFilter? filter, SearchScope scope)
{ {
if (string.IsNullOrWhiteSpace(queryText))
{
return new List<DomainId>();
}
var isFuzzy = queryText.StartsWith("~", StringComparison.OrdinalIgnoreCase);
if (isFuzzy)
{
queryText = queryText.Substring(1);
}
var serveField = GetServeField(scope); var serveField = GetServeField(scope);
var query = new var query = new
@ -132,7 +348,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic
{ {
term = new Dictionary<string, object> term = new Dictionary<string, object>
{ {
["appId.keyword"] = app.Id ["appId.keyword"] = app.Id.ToString()
} }
}, },
new new
@ -146,6 +362,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic
{ {
multi_match = new multi_match = new
{ {
fuzziness = isFuzzy ? (object)"AUTO" : 0,
fields = new[] fields = new[]
{ {
"texts.*" "texts.*"
@ -170,7 +387,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic
{ {
terms = new Dictionary<string, object> terms = new Dictionary<string, object>
{ {
["schemaId.keyword"] = filter.SchemaIds ["schemaId.keyword"] = filter.SchemaIds.Select(x => x.ToString()).ToArray()
} }
}; };
@ -197,7 +414,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic
{ {
if (item != null) if (item != null)
{ {
ids.Add(item["_source"]["contentId"]); ids.Add(DomainId.Create(item["_source"]["contentId"]));
} }
} }

24
backend/src/Squidex.Domain.Apps.Entities/Contents/Text/Extensions.cs

@ -6,6 +6,7 @@
// ========================================================================== // ==========================================================================
using System.Collections.Generic; using System.Collections.Generic;
using System.Globalization;
using System.Text; using System.Text;
using Microsoft.Extensions.ObjectPool; using Microsoft.Extensions.ObjectPool;
using Squidex.Domain.Apps.Core.Contents; using Squidex.Domain.Apps.Core.Contents;
@ -93,8 +94,29 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
sb.Append(" "); sb.Append(" ");
} }
sb.Append(text); foreach (var c in text)
{
if (IsCJKLetter(c))
{
sb.Append(c);
sb.Append(" ");
}
else
{
sb.Append(c);
}
}
} }
} }
// Returns true for characters of the Unicode category "OtherLetter" that are not Katakana.
// NOTE(review): "OtherLetter" is not limited to CJK scripts - confirm that the wider match is intended.
private static bool IsCJKLetter(char c)
{
    if (IsKatakana(c))
    {
        return false;
    }

    // Every "OtherLetter" character is also a letter for char.IsLetter, so the category check is sufficient.
    return char.GetUnicodeCategory(c) == UnicodeCategory.OtherLetter;
}
// Checks whether the character lies in the inclusive range U+30A0 to U+30FF.
private static bool IsKatakana(char c)
{
    const char first = '\u30A0';
    const char last = '\u30FF';

    return first <= c && c <= last;
}
} }
} }

2
backend/src/Squidex.Domain.Apps.Entities/Contents/Text/TextIndexingProcess.cs

@ -355,7 +355,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
var ids = var ids =
events events
.Select(x => x.Payload).OfType<ContentEvent>() .Select(x => x.Payload).OfType<ContentEvent>()
.Select(x => x.ContentId) .Select(x => DomainId.Combine(x.AppId.Id, x.ContentId))
.ToHashSet(); .ToHashSet();
return textIndexerState.GetAsync(ids); return textIndexerState.GetAsync(ids);

90
backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTestsBase.cs

@ -8,6 +8,7 @@
using System.Collections.Generic; using System.Collections.Generic;
using System.Linq; using System.Linq;
using System.Threading.Tasks; using System.Threading.Tasks;
using FluentAssertions;
using Squidex.Domain.Apps.Core.Contents; using Squidex.Domain.Apps.Core.Contents;
using Squidex.Domain.Apps.Entities.Apps; using Squidex.Domain.Apps.Entities.Apps;
using Squidex.Domain.Apps.Entities.Contents.Text.State; using Squidex.Domain.Apps.Entities.Contents.Text.State;
@ -18,6 +19,7 @@ using Squidex.Infrastructure.EventSourcing;
using Squidex.Infrastructure.Validation; using Squidex.Infrastructure.Validation;
using Xunit; using Xunit;
#pragma warning disable SA1401 // Fields should be private
#pragma warning disable SA1114 // Parameter list should follow declaration #pragma warning disable SA1114 // Parameter list should follow declaration
#pragma warning disable SA1115 // Parameter should follow comma #pragma warning disable SA1115 // Parameter should follow comma
#pragma warning disable RECS0021 // Warns about calls to virtual member functions occuring in the constructor #pragma warning disable RECS0021 // Warns about calls to virtual member functions occuring in the constructor
@ -26,13 +28,14 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
{ {
public abstract class TextIndexerTestsBase public abstract class TextIndexerTestsBase
{ {
private readonly List<DomainId> ids1 = new List<DomainId> { DomainId.NewGuid() }; protected readonly List<DomainId> ids1 = new List<DomainId> { DomainId.NewGuid() };
private readonly List<DomainId> ids2 = new List<DomainId> { DomainId.NewGuid() }; protected readonly List<DomainId> ids2 = new List<DomainId> { DomainId.NewGuid() };
private readonly NamedId<DomainId> appId = NamedId.Of(DomainId.NewGuid(), "my-app"); private readonly NamedId<DomainId> appId = NamedId.Of(DomainId.NewGuid(), "my-app");
private readonly NamedId<DomainId> schemaId = NamedId.Of(DomainId.NewGuid(), "my-schema"); private readonly NamedId<DomainId> schemaId = NamedId.Of(DomainId.NewGuid(), "my-schema");
private readonly IAppEntity app; private readonly IAppEntity app;
private delegate Task IndexOperation(TextIndexingProcess process); protected delegate Task IndexOperation(TextIndexingProcess process);
public abstract IIndexerFactory Factory { get; } public abstract IIndexerFactory Factory { get; }
@ -40,14 +43,12 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
public virtual bool SupportsSearchSyntax { get; set; } = true; public virtual bool SupportsSearchSyntax { get; set; } = true;
public virtual bool SupportsMultiLanguage { get; set; } = true;
public virtual InMemoryTextIndexerState State { get; } = new InMemoryTextIndexerState(); public virtual InMemoryTextIndexerState State { get; } = new InMemoryTextIndexerState();
protected TextIndexerTestsBase() protected TextIndexerTestsBase()
{ {
app = app =
Mocks.App(NamedId.Of(DomainId.NewGuid(), "my-app"), Mocks.App(appId,
Language.DE, Language.DE,
Language.EN); Language.EN);
} }
@ -91,41 +92,6 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
); );
} }
// Indexes one German ("de") and one English ("en") content, where each text also contains
// a word of the other language ("and" / "und"), and verifies the search results.
// The expectations depend on the SupportsMultiLanguage flag of the concrete test class.
[Fact]
public async Task Should_index_localized_content_and_retrieve()
{
if (SupportsMultiLanguage)
{
// Each search term, including "and" and "und", is expected to match exactly one of the two contents.
await TestCombinations(
Create(ids1[0], "de", "Stadt und Land and Fluss"),
Create(ids2[0], "en", "City and Country und River"),
Search(expected: ids1, text: "Stadt"),
Search(expected: ids2, text: "City"),
Search(expected: ids1, text: "and"),
Search(expected: ids2, text: "und")
);
}
else
{
// Both ids together, used as expectation for the term "und".
var both = ids2.Union(ids1).ToList();
// "and" uses expected: null - presumably "no result", the null branch of Search is not visible here.
await TestCombinations(
Create(ids1[0], "de", "Stadt und Land and Fluss"),
Create(ids2[0], "en", "City and Country und River"),
Search(expected: ids1, text: "Stadt"),
Search(expected: ids2, text: "City"),
Search(expected: null, text: "and"),
Search(expected: both, text: "und")
);
}
}
[Fact] [Fact]
public async Task Should_index_invariant_content_and_retrieve() public async Task Should_index_invariant_content_and_retrieve()
{ {
@ -316,10 +282,10 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
Search(expected: null, text: "V2", target: SearchScope.Published), Search(expected: null, text: "V2", target: SearchScope.Published),
// Make an update, this updates the current version only. // Make an update, this updates the current version only.
Update(ids1[0], "iv", "Night"), Update(ids1[0], "iv", "V3"),
Search(expected: ids1, text: "Night", target: SearchScope.All), Search(expected: ids1, text: "V3", target: SearchScope.All),
Search(expected: ids1, text: "Night", target: SearchScope.Published) Search(expected: ids1, text: "V3", target: SearchScope.Published)
); );
} }
@ -327,20 +293,20 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
public async Task Should_delete_documents_from_index() public async Task Should_delete_documents_from_index()
{ {
await TestCombinations( await TestCombinations(
Create(ids1[0], "iv", "Hello"), Create(ids1[0], "iv", "V1_1"),
Create(ids2[0], "iv", "World"), Create(ids2[0], "iv", "V2_1"),
Search(expected: ids1, text: "Hello"), Search(expected: ids1, text: "V1_1"),
Search(expected: ids2, text: "World"), Search(expected: ids2, text: "V2_1"),
Delete(ids1[0]), Delete(ids1[0]),
Search(expected: null, text: "Hello"), Search(expected: null, text: "V1_1"),
Search(expected: ids2, text: "World") Search(expected: ids2, text: "V2_1")
); );
} }
private IndexOperation Create(DomainId id, string language, string text) protected IndexOperation Create(DomainId id, string language, string text)
{ {
var data = var data =
new NamedContentData() new NamedContentData()
@ -351,7 +317,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
return Op(id, new ContentCreated { Data = data }); return Op(id, new ContentCreated { Data = data });
} }
private IndexOperation Update(DomainId id, string language, string text) protected IndexOperation Update(DomainId id, string language, string text)
{ {
var data = var data =
new NamedContentData() new NamedContentData()
@ -362,7 +328,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
return Op(id, new ContentUpdated { Data = data }); return Op(id, new ContentUpdated { Data = data });
} }
private IndexOperation CreateDraftWithData(DomainId id, string language, string text) protected IndexOperation CreateDraftWithData(DomainId id, string language, string text)
{ {
var data = var data =
new NamedContentData() new NamedContentData()
@ -373,27 +339,27 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
return Op(id, new ContentDraftCreated { MigratedData = data }); return Op(id, new ContentDraftCreated { MigratedData = data });
} }
private IndexOperation CreateDraft(DomainId id) protected IndexOperation CreateDraft(DomainId id)
{ {
return Op(id, new ContentDraftCreated()); return Op(id, new ContentDraftCreated());
} }
private IndexOperation Publish(DomainId id) protected IndexOperation Publish(DomainId id)
{ {
return Op(id, new ContentStatusChanged { Status = Status.Published }); return Op(id, new ContentStatusChanged { Status = Status.Published });
} }
private IndexOperation Unpublish( DomainId id) protected IndexOperation Unpublish( DomainId id)
{ {
return Op(id, new ContentStatusChanged { Status = Status.Draft }); return Op(id, new ContentStatusChanged { Status = Status.Draft });
} }
private IndexOperation DeleteDraft(DomainId id) protected IndexOperation DeleteDraft(DomainId id)
{ {
return Op(id, new ContentDraftDeleted()); return Op(id, new ContentDraftDeleted());
} }
private IndexOperation Delete(DomainId id) protected IndexOperation Delete(DomainId id)
{ {
return Op(id, new ContentDeleted()); return Op(id, new ContentDeleted());
} }
@ -407,7 +373,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
return p => p.On(Enumerable.Repeat(Envelope.Create<IEvent>(contentEvent), 1)); return p => p.On(Enumerable.Repeat(Envelope.Create<IEvent>(contentEvent), 1));
} }
private IndexOperation Search(List<DomainId>? expected, string text, SearchScope target = SearchScope.All) protected IndexOperation Search(List<DomainId>? expected, string text, SearchScope target = SearchScope.All)
{ {
return async p => return async p =>
{ {
@ -417,7 +383,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
if (expected != null) if (expected != null)
{ {
Assert.Equal(expected, result); result.Should().BeEquivalentTo(expected.ToHashSet());
} }
else else
{ {
@ -426,7 +392,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
}; };
} }
private async Task TestCombinations(params IndexOperation[] actions) protected async Task TestCombinations(params IndexOperation[] actions)
{ {
if (SupportsCleanup) if (SupportsCleanup)
{ {
@ -441,7 +407,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
} }
} }
private async Task TestCombinations(int firstSteps, params IndexOperation[] actions) protected async Task TestCombinations(int firstSteps, params IndexOperation[] actions)
{ {
await ExecuteAsync(async sut => await ExecuteAsync(async sut =>
{ {

32
backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTests_Elastic.cs

@ -10,6 +10,8 @@ using Squidex.Domain.Apps.Entities.Contents.Text.Elastic;
using Squidex.Infrastructure; using Squidex.Infrastructure;
using Xunit; using Xunit;
#pragma warning disable SA1115 // Parameter should follow comma
namespace Squidex.Domain.Apps.Entities.Contents.Text namespace Squidex.Domain.Apps.Entities.Contents.Text
{ {
[Trait("Category", "Dependencies")] [Trait("Category", "Dependencies")]
@ -22,11 +24,13 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
return Task.CompletedTask; return Task.CompletedTask;
} }
public Task<ITextIndex> CreateAsync(DomainId schemaId) public async Task<ITextIndex> CreateAsync(DomainId schemaId)
{ {
var index = new ElasticSearchTextIndex("http://localhost:9200", "squidex", true); var index = new ElasticSearchTextIndex("http://localhost:9200", "squidex", true);
return Task.FromResult<ITextIndex>(index); await index.InitializeAsync();
return index;
} }
} }
@ -35,7 +39,29 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
public TextIndexerTests_Elastic() public TextIndexerTests_Elastic()
{ {
SupportsSearchSyntax = false; SupportsSearchSyntax = false;
SupportsMultiLanguage = false; }
// Indexes the same text "and und" for a German and an English content and expects
// "and" to find only the German content and "und" to find only the English content.
[Fact]
public async Task Should_index_localized_content_without_stop_words_and_retrieve()
{
    const string content = "and und";

    var createGerman = Create(ids1[0], "de", content);
    var createEnglish = Create(ids2[0], "en", content);

    var searchAnd = Search(expected: ids1, text: "and");
    var searchUnd = Search(expected: ids2, text: "und");

    await TestCombinations(createGerman, createEnglish, searchAnd, searchUnd);
}
// Indexes a Chinese text under the language "zh" and expects that searching
// for single characters contained in this text finds the content.
[Fact]
public async Task Should_index_cjk_content_and_retrieve()
{
await TestCombinations(
Create(ids1[0], "zh", "可以将正向最大匹配方法和"),
Search(expected: ids1, text: "大"),
Search(expected: ids1, text: "匹")
);
} }
} }
} }

18
backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTests_Mongo.cs

@ -5,12 +5,15 @@
// All rights reserved. Licensed under the MIT license. // All rights reserved. Licensed under the MIT license.
// ========================================================================== // ==========================================================================
using System.Linq;
using System.Threading.Tasks; using System.Threading.Tasks;
using MongoDB.Driver; using MongoDB.Driver;
using Squidex.Domain.Apps.Entities.MongoDb.FullText; using Squidex.Domain.Apps.Entities.MongoDb.FullText;
using Squidex.Infrastructure; using Squidex.Infrastructure;
using Xunit; using Xunit;
#pragma warning disable SA1115 // Parameter should follow comma
namespace Squidex.Domain.Apps.Entities.Contents.Text namespace Squidex.Domain.Apps.Entities.Contents.Text
{ {
[Trait("Category", "Dependencies")] [Trait("Category", "Dependencies")]
@ -42,7 +45,20 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
public TextIndexerTests_Mongo() public TextIndexerTests_Mongo()
{ {
SupportsSearchSyntax = false; SupportsSearchSyntax = false;
SupportsMultiLanguage = false; }
// Indexes the same text "and und" for a German and an English content and expects
// both contents to be found for each of the two words.
[Fact]
public async Task Should_index_localized_content_without_stop_words_and_retrieve()
{
// Both ids together, used as expectation for both search terms.
var both = ids2.Union(ids1).ToList();
await TestCombinations(
Create(ids1[0], "de", "and und"),
Create(ids2[0], "en", "and und"),
Search(expected: both, text: "and"),
Search(expected: both, text: "und")
);
} }
} }
} }
Loading…
Cancel
Save