Skip to content

Commit 69c2871

Browse files
committed
Improve robustness of charset handling when parsing HTML.
When choosing between the server-provided and the automatically detected Content-Type for text types, use the value that includes a charset specification. This handles the case where the server sends a plain "text/html" Content-Type, but the code detects the more specific "text/html; charset=utf-8".
1 parent 404c3cf commit 69c2871

File tree

1 file changed

+10
-2
lines changed

1 file changed

+10
-2
lines changed

html_meta_parser.go

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ import (
1717

1818
func basicParseHTML(chunk *pageChunk) *unfurlResult {
1919
result := new(unfurlResult)
20-
result.Type = http.DetectContentType(chunk.data)
20+
sniffedContentType := http.DetectContentType(chunk.data)
21+
result.Type = sniffedContentType
2122
switch {
2223
case strings.HasPrefix(result.Type, "image/"):
2324
result.Type = "image"
@@ -26,7 +27,14 @@ func basicParseHTML(chunk *pageChunk) *unfurlResult {
2627
result.Type = "website"
2728
// pass Content-Type from response headers as it may have
2829
// charset definition like "text/html; charset=windows-1251"
29-
if title, desc, err := extractData(chunk.data, chunk.ct); err == nil {
30+
ct := chunk.ct
31+
// There are cases where Content-Type header is "text/html", but http.DetectContentType
32+
// narrows it down to a more specific "text/html; charset=utf-8". In such a case use
33+
// the latter.
34+
if !strings.Contains(ct, "charset=") && strings.Contains(sniffedContentType, "charset=") {
35+
ct = sniffedContentType
36+
}
37+
if title, desc, err := extractData(chunk.data, ct); err == nil {
3038
result.Title = title
3139
result.Description = desc
3240
}

0 commit comments

Comments
 (0)