Skip to content

Commit c61fc74

Browse files
authored
Cherry picked #19540 to V16 (and fixed changed signatures) (#19592)
1 parent 67106f0 commit c61fc74

File tree

2 files changed

+103
-1
lines changed

2 files changed

+103
-1
lines changed

src/Umbraco.Infrastructure/PropertyEditors/RichTextPropertyIndexValueFactory.cs

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
using Microsoft.Extensions.Logging;
22
using Microsoft.Extensions.Options;
3+
using System.Text.RegularExpressions;
34
using Umbraco.Cms.Core.Configuration.Models;
45
using Umbraco.Cms.Core.Models;
56
using Umbraco.Cms.Core.Serialization;
@@ -50,7 +51,7 @@ public override IEnumerable<IndexValue> GetIndexValues(
5051
};
5152

5253
// the actual content (RTE content without markup, i.e. the actual words) must be indexed under the property alias
53-
var richTextWithoutMarkup = richTextEditorValue.Markup.StripHtml();
54+
var richTextWithoutMarkup = StripHtmlForIndexing(richTextEditorValue.Markup);
5455
if (richTextEditorValue.Blocks?.ContentData.Any() is not true)
5556
{
5657
// no blocks; index the content for the culture and be done with it
@@ -132,4 +133,27 @@ public override IEnumerable<IndexValue> GetIndexValues(
132133

133134
/// <summary>
/// Gets the raw data items for the blocks carried by a rich text value by delegating to the
/// overload that takes the block content data and expose collections.
/// </summary>
/// <param name="input">The rich text value whose blocks (if any) are read.</param>
/// <param name="published">Passed through unchanged to the delegated overload.</param>
/// <returns>The data items returned by the delegated overload.</returns>
protected override IEnumerable<RawDataItem> GetDataItems(RichTextEditorValue input, bool published)
{
    // A value without blocks contributes empty collections rather than null.
    var contentData = input.Blocks?.ContentData ?? [];
    var expose = input.Blocks?.Expose ?? [];

    return GetDataItems(contentData, expose, published);
}
136+
137+
/// <summary>
/// Converts rich text markup into plain text suitable for indexing.
/// </summary>
/// <remarks>
/// Each <c>&lt;br&gt;</c> tag (self-closing or not, with or without attributes), together with any
/// whitespace that follows it, is replaced by a single space before the remaining markup is stripped.
/// Without this step the words on either side of a line break would be joined into one word.
/// The <c>\b</c> in the pattern keeps other tags whose names merely start with "br" from matching.
/// </remarks>
/// <param name="html">The markup to convert.</param>
/// <returns>
/// An empty string when <paramref name="html"/> is null, empty or whitespace; otherwise the markup
/// with line breaks turned into spaces and all other HTML removed by <c>StripHtml</c>.
/// </returns>
private static string StripHtmlForIndexing(string html)
    => string.IsNullOrWhiteSpace(html)
        ? string.Empty
        : Regex.Replace(html, @"<br\b[^>]*/?>\s*", " ", RegexOptions.IgnoreCase).StripHtml();
135159
}
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
using Microsoft.Extensions.Logging;
2+
using Microsoft.Extensions.Options;
3+
using Moq;
4+
using NUnit.Framework;
5+
using Umbraco.Cms.Core.Configuration.Models;
6+
using Umbraco.Cms.Core.Models;
7+
using Umbraco.Cms.Core.PropertyEditors;
8+
using Umbraco.Cms.Core.Serialization;
9+
10+
namespace Umbraco.Cms.Tests.UnitTests.Umbraco.Core.PropertyEditors;
11+
12+
/// <summary>
/// Unit tests for <see cref="RichTextPropertyIndexValueFactory"/>, covering the plain text it indexes
/// under the property alias for a given piece of rich text markup.
/// </summary>
public class RichTextPropertyIndexValueFactoryTests
{
    /// <summary>
    /// Verifies that the value indexed under the property alias is the markup with its tags removed,
    /// and that <c>&lt;br&gt;</c> tags become a space while other tags (including ones that merely
    /// start with "br") do not.
    /// </summary>
    /// <param name="testContent">The rich text markup returned by the mocked property.</param>
    /// <param name="expected">The text expected in the index, ignoring trailing whitespace.</param>
    [TestCase("<p>Sample text</p>", "Sample text")]
    [TestCase("<p>John Smith<br>Company ABC<br>London</p>", "John Smith Company ABC London")]
    [TestCase("<p>John Smith<break>Company ABC<break>London</p>", "John SmithCompany ABCLondon")]
    [TestCase("<p>John Smith<br>Company ABC<branything>London</p>", "John Smith Company ABCLondon")]
    [TestCase("<p>Another sample text with <strong>bold</strong> content</p>", "Another sample text with bold content")]
    [TestCase("<p>Text with <a href=\"https://example.com\">link</a></p>", "Text with link")]
    [TestCase("<p>Text with <img src=\"image.jpg\" alt=\"image\" /></p>", "Text with")]
    [TestCase("<p>Text with <span style=\"color: red;\">styled text</span></p>", "Text with styled text")]
    [TestCase("<p>Text with <em>emphasized</em> content</p>", "Text with emphasized content")]
    [TestCase("<p>Text with <u>underlined</u> content</p>", "Text with underlined content")]
    [TestCase("<p>Text with <code>inline code</code></p>", "Text with inline code")]
    [TestCase("<p>Text with <pre><code>code block</code></pre></p>", "Text with code block")]
    [TestCase("<p>Text with <blockquote>quoted text</blockquote></p>", "Text with quoted text")]
    [TestCase("<p>Text with <ul><li>list item 1</li><li>list item 2</li></ul></p>",
        "Text with list item 1list item 2")]
    [TestCase("<p>Text with <ol><li>ordered item 1</li><li>ordered item 2</li></ol></p>",
        "Text with ordered item 1ordered item 2")]
    [TestCase("<p>Text with <div class=\"class-name\">div content</div></p>", "Text with div content")]
    [TestCase("<p>Text with <span class=\"class-name\">span content</span></p>", "Text with span content")]
    [TestCase("<p>Text with <strong>bold</strong> and <em>italic</em> content</p>",
        "Text with bold and italic content")]
    [TestCase("<p>Text with <a href=\"https://example.com\" target=\"_blank\">external link</a></p>",
        "Text with external link")]
    [TestCase("<p>John Smith<br class=\"test\">Company ABC<br>London</p>", "John Smith Company ABC London")]
    [TestCase("<p>John Smith<br \r\n />Company ABC<br>London</p>", "John Smith Company ABC London")]
    public void Can_Create_Index_Values_From_RichText_Property(string testContent, string expected)
    {
        // Arrange: a factory wired up with stand-in dependencies and default indexing settings.
        var settingsMonitor = Mock.Of<IOptionsMonitor<IndexingSettings>>();
        Mock.Get(settingsMonitor).Setup(x => x.CurrentValue).Returns(new IndexingSettings { });

        var editors = new PropertyEditorCollection(new DataEditorCollection(() => null));
        var serializer = Mock.Of<IJsonSerializer>();
        var factoryLogger = Mock.Of<ILogger<RichTextPropertyIndexValueFactory>>();

        var factory = new RichTextPropertyIndexValueFactory(
            editors,
            serializer,
            settingsMonitor,
            factoryLogger);

        // A property that reports the markup under test for any culture, segment and published flag.
        var propertyAlias = "richText";
        var property = Mock.Of<IProperty>(p => p.Alias == propertyAlias
                                               && (string)p.GetValue(It.IsAny<string>(), It.IsAny<string>(),
                                                   It.IsAny<bool>()) == testContent);

        // Act: build the index values and pick out the one stored under the property alias.
        var indexValues = factory.GetIndexValues(property, null, null, true, [], new Dictionary<Guid, IContentType>());
        var aliasEntry = indexValues.FirstOrDefault(entry => entry.FieldName == propertyAlias);
        Assert.IsNotNull(aliasEntry);

        // Assert: the single indexed value matches the expected text; trailing whitespace is tolerated.
        var indexedText = aliasEntry.Values.SingleOrDefault() as string;
        Assert.IsNotNull(indexedText);
        Assert.AreEqual(expected, indexedText.TrimEnd());
    }
}

0 commit comments

Comments
 (0)