Improve robustness of charset handling when parsing HTML.

artyom · artyom · commit 69c28711c74f · 2024-06-14T15:07:15.000+02:00
For text types, when choosing between the server-provided and the
automatically detected Content-Type, prefer the value that includes a
charset specification.

This handles the case where the server sends a plain "text/html" Content-Type,
but http.DetectContentType reports the more specific
"text/html; charset=utf-8".
diff --git a/html_meta_parser.go b/html_meta_parser.go
@@ -17,7 +17,8 @@ import (
 
 func basicParseHTML(chunk *pageChunk) *unfurlResult {
 	result := new(unfurlResult)
-	result.Type = http.DetectContentType(chunk.data)
+	sniffedContentType := http.DetectContentType(chunk.data)
+	result.Type = sniffedContentType
 	switch {
 	case strings.HasPrefix(result.Type, "image/"):
 		result.Type = "image"
@@ -26,7 +27,14 @@ func basicParseHTML(chunk *pageChunk) *unfurlResult {
 		result.Type = "website"
 		// pass Content-Type from response headers as it may have
 		// charset definition like "text/html; charset=windows-1251"
-		if title, desc, err := extractData(chunk.data, chunk.ct); err == nil {
+		ct := chunk.ct
+		// There are cases where Content-Type header is "text/html", but http.DetectContentType
+		// narrows it down to a more specific "text/html; charset=utf-8". In such a case use
+		// the latter.
+		if !strings.Contains(ct, "charset=") && strings.Contains(sniffedContentType, "charset=") {
+			ct = sniffedContentType
+		}
+		if title, desc, err := extractData(chunk.data, ct); err == nil {
 			result.Title = title
 			result.Description = desc
 		}