This repository has been archived by the owner on Jun 10, 2019. It is now read-only.
forked from gogs/chardet
-
Notifications
You must be signed in to change notification settings - Fork 1
/
recognizer.go
83 lines (76 loc) · 1.59 KB
/
recognizer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
package chardet
type recognizer interface {
Match(*recognizerInput) recognizerOutput
}
type recognizerOutput Result
type recognizerInput struct {
raw []byte
input []byte
tagStripped bool
byteStats []int
hasC1Bytes bool
}
func newRecognizerInput(raw []byte, stripTag bool) *recognizerInput {
input, stripped := mayStripInput(raw, stripTag)
byteStats := computeByteStats(input)
return &recognizerInput{
raw: raw,
input: input,
tagStripped: stripped,
byteStats: byteStats,
hasC1Bytes: computeHasC1Bytes(byteStats),
}
}
func mayStripInput(raw []byte, stripTag bool) (out []byte, stripped bool) {
const inputBufferSize = 8192
out = make([]byte, 0, inputBufferSize)
var badTags, openTags int32
var inMarkup bool = false
stripped = false
if stripTag {
stripped = true
for _, c := range raw {
if c == '<' {
if inMarkup {
badTags += 1
}
inMarkup = true
openTags += 1
}
if !inMarkup {
out = append(out, c)
if len(out) >= inputBufferSize {
break
}
}
if c == '>' {
inMarkup = false
}
}
}
if openTags < 5 || openTags/5 < badTags || (len(out) < 100 && len(raw) > 600) {
limit := len(raw)
if limit > inputBufferSize {
limit = inputBufferSize
}
out = make([]byte, limit)
copy(out, raw[:limit])
stripped = false
}
return
}
func computeByteStats(input []byte) []int {
r := make([]int, 256)
for _, c := range input {
r[c] += 1
}
return r
}
func computeHasC1Bytes(byteStats []int) bool {
for _, count := range byteStats[0x80 : 0x9F+1] {
if count > 0 {
return true
}
}
return false
}