Browse Source

Improvements to full text

pull/590/head
Sebastian 5 years ago
parent
commit
a958c1bf02
  1. 4
      backend/src/Squidex.Domain.Apps.Entities.MongoDb/FullText/MongoTextIndex.cs
  2. 2
      backend/src/Squidex.Domain.Apps.Entities.MongoDb/FullText/MongoTextIndexEntityText.cs
  3. 225
      backend/src/Squidex.Domain.Apps.Entities/Contents/Text/Elastic/ElasticSearchTextIndex.cs
  4. 24
      backend/src/Squidex.Domain.Apps.Entities/Contents/Text/Extensions.cs
  5. 2
      backend/src/Squidex.Domain.Apps.Entities/Contents/Text/TextIndexingProcess.cs
  6. 90
      backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTestsBase.cs
  7. 32
      backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTests_Elastic.cs
  8. 18
      backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTests_Mongo.cs

4
backend/src/Squidex.Domain.Apps.Entities.MongoDb/FullText/MongoTextIndex.cs

@ -131,7 +131,7 @@ namespace Squidex.Domain.Apps.Entities.MongoDb.FullText
Filter.Eq(x => x.AppId, app.Id),
Filter.In(x => x.SchemaId, filter.SchemaIds),
Filter_ByScope(scope),
Filter.Text(queryText)))
Filter.Text(queryText, "none")))
.Only(x => x.ContentId).Limit(limit)
.ToListAsync();
@ -146,7 +146,7 @@ namespace Squidex.Domain.Apps.Entities.MongoDb.FullText
Filter.Eq(x => x.AppId, app.Id),
Filter.Exists(x => x.SchemaId),
Filter_ByScope(scope),
Filter.Text(queryText)))
Filter.Text(queryText, "none")))
.Only(x => x.ContentId).Limit(limit)
.ToListAsync();

2
backend/src/Squidex.Domain.Apps.Entities.MongoDb/FullText/MongoTextIndexEntityText.cs

@ -17,6 +17,6 @@ namespace Squidex.Domain.Apps.Entities.MongoDb.FullText
[BsonIgnoreIfNull]
[BsonElement("language")]
public string Language { get; set; }
public string Language { get; set; } = "none";
}
}

225
backend/src/Squidex.Domain.Apps.Entities/Contents/Text/Elastic/ElasticSearchTextIndex.cs

@ -8,6 +8,8 @@
using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using Elasticsearch.Net;
using Squidex.Domain.Apps.Entities.Apps;
@ -16,7 +18,7 @@ using Squidex.Infrastructure;
namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic
{
[ExcludeFromCodeCoverage]
public sealed class ElasticSearchTextIndex : ITextIndex
public sealed class ElasticSearchTextIndex : ITextIndex, IInitializable
{
private readonly ElasticLowLevelClient client;
private readonly string indexName;
@ -36,6 +38,208 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic
this.waitForTesting = waitForTesting;
}
/// <summary>
/// Puts a mapping to the index that declares one text field per supported language
/// ("texts.&lt;code&gt;") and assigns the analyzer to use for that field.
/// </summary>
/// <param name="ct">
/// The cancellation token.
/// NOTE(review): the token is currently not forwarded to the client call - confirm whether it should be.
/// </param>
/// <exception cref="InvalidOperationException">The put mapping request was not successful.</exception>
public async Task InitializeAsync(CancellationToken ct = default)
{
    // Language code (suffix of the "texts.<code>" field) => analyzer name for that field.
    // NOTE(review): "br" is the ISO 639-1 code for Breton, not for Brazilian Portuguese - confirm this entry is intended.
    var analyzers = new Dictionary<string, string>
    {
        ["ar"] = "arabic",
        ["hy"] = "armenian",
        ["eu"] = "basque",
        ["bn"] = "bengali",
        ["br"] = "brazilian",
        ["bg"] = "bulgarian",
        ["ca"] = "catalan",
        ["zh"] = "cjk",
        ["ja"] = "cjk",
        ["ko"] = "cjk",
        ["cs"] = "czech",
        ["da"] = "danish",
        ["nl"] = "dutch",
        ["en"] = "english",
        ["fi"] = "finnish",
        ["fr"] = "french",
        ["gl"] = "galician",
        ["de"] = "german",
        ["el"] = "greek",
        ["hi"] = "hindi",
        ["hu"] = "hungarian",
        ["id"] = "indonesian",
        ["ga"] = "irish",
        ["it"] = "italian",
        ["lv"] = "latvian",
        ["lt"] = "lithuanian",
        ["nb"] = "norwegian",
        ["nn"] = "norwegian",
        ["no"] = "norwegian",
        ["pt"] = "portuguese",
        ["ro"] = "romanian",
        ["ru"] = "russian",
        ["ku"] = "sorani",
        ["es"] = "spanish",
        ["sv"] = "swedish",
        ["tr"] = "turkish",
        ["th"] = "thai"
    };

    var query = new
    {
        // Same shape as a hand-written literal: { "texts.<code>": { type: "text", analyzer: "<name>" }, ... }
        properties = analyzers.ToDictionary(
            x => $"texts.{x.Key}",
            x => (object)new
            {
                type = "text",
                analyzer = x.Value
            })
    };

    var result = await client.Indices.PutMappingAsync<StringResponse>(indexName, CreatePost(query));

    if (!result.Success)
    {
        // C# interpolation holes are written as {expr}; "${expr}" would emit a stray '$' in front of the body.
        throw new InvalidOperationException($"Failed with {result.Body}", result.OriginalException);
    }
}
public Task ClearAsync()
{
return Task.CompletedTask;
@ -118,6 +322,18 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic
public async Task<List<DomainId>?> SearchAsync(string? queryText, IAppEntity app, SearchFilter? filter, SearchScope scope)
{
if (string.IsNullOrWhiteSpace(queryText))
{
return new List<DomainId>();
}
var isFuzzy = queryText.StartsWith("~", StringComparison.OrdinalIgnoreCase);
if (isFuzzy)
{
queryText = queryText.Substring(1);
}
var serveField = GetServeField(scope);
var query = new
@ -132,7 +348,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic
{
term = new Dictionary<string, object>
{
["appId.keyword"] = app.Id
["appId.keyword"] = app.Id.ToString()
}
},
new
@ -146,6 +362,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic
{
multi_match = new
{
fuzziness = isFuzzy ? (object)"AUTO" : 0,
fields = new[]
{
"texts.*"
@ -170,7 +387,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic
{
terms = new Dictionary<string, object>
{
["schemaId.keyword"] = filter.SchemaIds
["schemaId.keyword"] = filter.SchemaIds.Select(x => x.ToString()).ToArray()
}
};
@ -197,7 +414,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text.Elastic
{
if (item != null)
{
ids.Add(item["_source"]["contentId"]);
ids.Add(DomainId.Create(item["_source"]["contentId"]));
}
}

24
backend/src/Squidex.Domain.Apps.Entities/Contents/Text/Extensions.cs

@ -6,6 +6,7 @@
// ==========================================================================
using System.Collections.Generic;
using System.Globalization;
using System.Text;
using Microsoft.Extensions.ObjectPool;
using Squidex.Domain.Apps.Core.Contents;
@ -93,8 +94,29 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
sb.Append(" ");
}
sb.Append(text);
foreach (var c in text)
{
if (IsCJKLetter(c))
{
sb.Append(c);
sb.Append(" ");
}
else
{
sb.Append(c);
}
}
}
}
/// <summary>
/// Returns true when the character is a letter of Unicode category OtherLetter that is not
/// inside the Katakana block.
/// </summary>
/// <remarks>
/// NOTE(review): OtherLetter is not limited to CJK ideographs (other caseless scripts are in this
/// category as well) - confirm that matching those characters is intended.
/// </remarks>
private static bool IsCJKLetter(char c)
{
    if (!char.IsLetter(c))
    {
        return false;
    }

    if (char.GetUnicodeCategory(c) != UnicodeCategory.OtherLetter)
    {
        return false;
    }

    return !IsKatakana(c);
}
/// <summary>
/// Returns true when the character is within the range U+30A0 to U+30FF (both inclusive).
/// </summary>
private static bool IsKatakana(char c)
{
    const char firstKatakana = '\u30A0';
    const char lastKatakana = '\u30FF';

    return firstKatakana <= c && c <= lastKatakana;
}
}
}

2
backend/src/Squidex.Domain.Apps.Entities/Contents/Text/TextIndexingProcess.cs

@ -355,7 +355,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
var ids =
events
.Select(x => x.Payload).OfType<ContentEvent>()
.Select(x => x.ContentId)
.Select(x => DomainId.Combine(x.AppId.Id, x.ContentId))
.ToHashSet();
return textIndexerState.GetAsync(ids);

90
backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTestsBase.cs

@ -8,6 +8,7 @@
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using FluentAssertions;
using Squidex.Domain.Apps.Core.Contents;
using Squidex.Domain.Apps.Entities.Apps;
using Squidex.Domain.Apps.Entities.Contents.Text.State;
@ -18,6 +19,7 @@ using Squidex.Infrastructure.EventSourcing;
using Squidex.Infrastructure.Validation;
using Xunit;
#pragma warning disable SA1401 // Fields should be private
#pragma warning disable SA1114 // Parameter list should follow declaration
#pragma warning disable SA1115 // Parameter should follow comma
#pragma warning disable RECS0021 // Warns about calls to virtual member functions occurring in the constructor
@ -26,13 +28,14 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
{
public abstract class TextIndexerTestsBase
{
private readonly List<DomainId> ids1 = new List<DomainId> { DomainId.NewGuid() };
private readonly List<DomainId> ids2 = new List<DomainId> { DomainId.NewGuid() };
protected readonly List<DomainId> ids1 = new List<DomainId> { DomainId.NewGuid() };
protected readonly List<DomainId> ids2 = new List<DomainId> { DomainId.NewGuid() };
private readonly NamedId<DomainId> appId = NamedId.Of(DomainId.NewGuid(), "my-app");
private readonly NamedId<DomainId> schemaId = NamedId.Of(DomainId.NewGuid(), "my-schema");
private readonly IAppEntity app;
private delegate Task IndexOperation(TextIndexingProcess process);
protected delegate Task IndexOperation(TextIndexingProcess process);
public abstract IIndexerFactory Factory { get; }
@ -40,14 +43,12 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
public virtual bool SupportsSearchSyntax { get; set; } = true;
public virtual bool SupportsMultiLanguage { get; set; } = true;
public virtual InMemoryTextIndexerState State { get; } = new InMemoryTextIndexerState();
protected TextIndexerTestsBase()
{
app =
Mocks.App(NamedId.Of(DomainId.NewGuid(), "my-app"),
Mocks.App(appId,
Language.DE,
Language.EN);
}
@ -91,41 +92,6 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
);
}
[Fact]
public async Task Should_index_localized_content_and_retrieve()
{
    // Both documents contain the "and" / "und" word of the other language, so the expected
    // results for these two words depend on whether the index is language aware.
    if (!SupportsMultiLanguage)
    {
        var both = ids2.Union(ids1).ToList();

        // Without multi language support: "and" is expected to yield no result and "und" both documents.
        await TestCombinations(
            Create(ids1[0], "de", "Stadt und Land and Fluss"),
            Create(ids2[0], "en", "City and Country und River"),

            Search(expected: ids1, text: "Stadt"),
            Search(expected: ids2, text: "City"),

            Search(expected: null, text: "and"),
            Search(expected: both, text: "und")
        );

        return;
    }

    // With multi language support: each word is only expected in the document of the other language.
    await TestCombinations(
        Create(ids1[0], "de", "Stadt und Land and Fluss"),
        Create(ids2[0], "en", "City and Country und River"),

        Search(expected: ids1, text: "Stadt"),
        Search(expected: ids2, text: "City"),

        Search(expected: ids1, text: "and"),
        Search(expected: ids2, text: "und")
    );
}
[Fact]
public async Task Should_index_invariant_content_and_retrieve()
{
@ -316,10 +282,10 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
Search(expected: null, text: "V2", target: SearchScope.Published),
// Make an update, this updates the current version only.
Update(ids1[0], "iv", "Night"),
Update(ids1[0], "iv", "V3"),
Search(expected: ids1, text: "Night", target: SearchScope.All),
Search(expected: ids1, text: "Night", target: SearchScope.Published)
Search(expected: ids1, text: "V3", target: SearchScope.All),
Search(expected: ids1, text: "V3", target: SearchScope.Published)
);
}
@ -327,20 +293,20 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
public async Task Should_delete_documents_from_index()
{
await TestCombinations(
Create(ids1[0], "iv", "Hello"),
Create(ids2[0], "iv", "World"),
Create(ids1[0], "iv", "V1_1"),
Create(ids2[0], "iv", "V2_1"),
Search(expected: ids1, text: "Hello"),
Search(expected: ids2, text: "World"),
Search(expected: ids1, text: "V1_1"),
Search(expected: ids2, text: "V2_1"),
Delete(ids1[0]),
Search(expected: null, text: "Hello"),
Search(expected: ids2, text: "World")
Search(expected: null, text: "V1_1"),
Search(expected: ids2, text: "V2_1")
);
}
private IndexOperation Create(DomainId id, string language, string text)
protected IndexOperation Create(DomainId id, string language, string text)
{
var data =
new NamedContentData()
@ -351,7 +317,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
return Op(id, new ContentCreated { Data = data });
}
private IndexOperation Update(DomainId id, string language, string text)
protected IndexOperation Update(DomainId id, string language, string text)
{
var data =
new NamedContentData()
@ -362,7 +328,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
return Op(id, new ContentUpdated { Data = data });
}
private IndexOperation CreateDraftWithData(DomainId id, string language, string text)
protected IndexOperation CreateDraftWithData(DomainId id, string language, string text)
{
var data =
new NamedContentData()
@ -373,27 +339,27 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
return Op(id, new ContentDraftCreated { MigratedData = data });
}
private IndexOperation CreateDraft(DomainId id)
protected IndexOperation CreateDraft(DomainId id)
{
return Op(id, new ContentDraftCreated());
}
private IndexOperation Publish(DomainId id)
protected IndexOperation Publish(DomainId id)
{
return Op(id, new ContentStatusChanged { Status = Status.Published });
}
private IndexOperation Unpublish( DomainId id)
protected IndexOperation Unpublish( DomainId id)
{
return Op(id, new ContentStatusChanged { Status = Status.Draft });
}
private IndexOperation DeleteDraft(DomainId id)
protected IndexOperation DeleteDraft(DomainId id)
{
return Op(id, new ContentDraftDeleted());
}
private IndexOperation Delete(DomainId id)
protected IndexOperation Delete(DomainId id)
{
return Op(id, new ContentDeleted());
}
@ -407,7 +373,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
return p => p.On(Enumerable.Repeat(Envelope.Create<IEvent>(contentEvent), 1));
}
private IndexOperation Search(List<DomainId>? expected, string text, SearchScope target = SearchScope.All)
protected IndexOperation Search(List<DomainId>? expected, string text, SearchScope target = SearchScope.All)
{
return async p =>
{
@ -417,7 +383,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
if (expected != null)
{
Assert.Equal(expected, result);
result.Should().BeEquivalentTo(expected.ToHashSet());
}
else
{
@ -426,7 +392,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
};
}
private async Task TestCombinations(params IndexOperation[] actions)
protected async Task TestCombinations(params IndexOperation[] actions)
{
if (SupportsCleanup)
{
@ -441,7 +407,7 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
}
}
private async Task TestCombinations(int firstSteps, params IndexOperation[] actions)
protected async Task TestCombinations(int firstSteps, params IndexOperation[] actions)
{
await ExecuteAsync(async sut =>
{

32
backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTests_Elastic.cs

@ -10,6 +10,8 @@ using Squidex.Domain.Apps.Entities.Contents.Text.Elastic;
using Squidex.Infrastructure;
using Xunit;
#pragma warning disable SA1115 // Parameter should follow comma
namespace Squidex.Domain.Apps.Entities.Contents.Text
{
[Trait("Category", "Dependencies")]
@ -22,11 +24,13 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
return Task.CompletedTask;
}
public Task<ITextIndex> CreateAsync(DomainId schemaId)
public async Task<ITextIndex> CreateAsync(DomainId schemaId)
{
var index = new ElasticSearchTextIndex("http://localhost:9200", "squidex", true);
return Task.FromResult<ITextIndex>(index);
await index.InitializeAsync();
return index;
}
}
@ -35,7 +39,29 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
public TextIndexerTests_Elastic()
{
SupportsSearchSyntax = false;
SupportsMultiLanguage = false;
}
[Fact]
public async Task Should_index_localized_content_without_stop_words_and_retrieve()
{
    // The same text is indexed once as German and once as English. Searching for "and" must
    // only return the German document and searching for "und" only the English document
    // (presumably because each language analyzer drops its own stop words - see test name).
    var operations = new[]
    {
        Create(ids1[0], "de", "and und"),
        Create(ids2[0], "en", "and und"),

        Search(expected: ids1, text: "and"),
        Search(expected: ids2, text: "und")
    };

    await TestCombinations(operations);
}
[Fact]
public async Task Should_index_cjk_content_and_retrieve()
{
    // A text without whitespace is indexed under "zh"; single characters taken from
    // the middle of that text must still find the document.
    var operations = new[]
    {
        Create(ids1[0], "zh", "可以将正向最大匹配方法和"),

        Search(expected: ids1, text: "大"),
        Search(expected: ids1, text: "匹")
    };

    await TestCombinations(operations);
}
}
}

18
backend/tests/Squidex.Domain.Apps.Entities.Tests/Contents/Text/TextIndexerTests_Mongo.cs

@ -5,12 +5,15 @@
// All rights reserved. Licensed under the MIT license.
// ==========================================================================
using System.Linq;
using System.Threading.Tasks;
using MongoDB.Driver;
using Squidex.Domain.Apps.Entities.MongoDb.FullText;
using Squidex.Infrastructure;
using Xunit;
#pragma warning disable SA1115 // Parameter should follow comma
namespace Squidex.Domain.Apps.Entities.Contents.Text
{
[Trait("Category", "Dependencies")]
@ -42,7 +45,20 @@ namespace Squidex.Domain.Apps.Entities.Contents.Text
public TextIndexerTests_Mongo()
{
SupportsSearchSyntax = false;
SupportsMultiLanguage = false;
}
[Fact]
public async Task Should_index_localized_content_without_stop_words_and_retrieve()
{
    // The same text is indexed once as German and once as English; for this index both
    // words are expected to return both documents, independent of the language.
    var both = new List<DomainId>(ids2.Union(ids1));

    var operations = new[]
    {
        Create(ids1[0], "de", "and und"),
        Create(ids2[0], "en", "and und"),

        Search(expected: both, text: "and"),
        Search(expected: both, text: "und")
    };

    await TestCombinations(operations);
}
}
}
Loading…
Cancel
Save