最近要用 golang 写一个需要解析 HTML 的项目，到网上找了一个库叫 `goquery`。虽然它的 API 挺不错，css selector 基本上也全支持了，但写这种代码果然还是有点无聊，于是我就想，为什么不能跟 go 的 `json` 库和 `xml` 库一样，直接 `Unmarshal(HTML)` 呢？
然后我花了两天时间撸出了 unhtml
-> Github
样例 & 性能
有个 HTML：
// AllTypeHTML is the sample HTML document parsed by the examples and
// benchmarks that follow. Everything of interest sits under the element with
// id "test": a list of integers, a nested div with three paragraphs, and four
// further paragraphs holding a string, an int, a float and a bool.
var AllTypeHTML = []byte(`
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<div id="test">
<ul>
<li>0</li>
<li>1</li>
<li>2</li>
<li>3</li>
</ul>
<div>
<p>Hexilee</p>
<p>20</p>
<p>true</p>
</div>
<p>Hello World!</p>
<p>10</p>
<p>3.14</p>
<p>true</p>
</div>
</body>
</html>
`)
如果你想把它解析为一个结构体：
package example
// TestUser is the nested record stored in PartTypesStruct.Struct.
type TestUser struct {
	Name      string
	Age       uint
	LikeLemon bool
}

// PartTypesStruct is the value the sample HTML document is parsed into. It
// carries one field per supported kind: a slice, a nested struct, a string,
// an int, a float64 and a bool.
type PartTypesStruct struct {
	Slice   []int
	Struct  TestUser
	String  string
	Int     int
	Float64 float64
	Bool    bool
}
直接用 `goquery` 要这样写：
package example
import (
"bytes"
"github.com/PuerkitoBio/goquery"
"strconv"
)
// parsePartTypesLogically fills a PartTypesStruct from AllTypeHTML by hand,
// using goquery selectors and strconv conversions.
//
// Parsing stops at the first failure. The returned error is that failure and
// the returned struct holds only the fields that were filled before it; the
// four top-level scalar fields are assigned together, after all of them have
// parsed successfully.
func parsePartTypesLogically() (PartTypesStruct, error) {
	partTypes := PartTypesStruct{}
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(AllTypeHTML))
	if err != nil {
		return partTypes, err
	}
	root := doc.Find(partTypes.Root())

	// Start from an empty, non-nil slice so it encodes as [] rather than null.
	partTypes.Slice = make([]int, 0)
	root.Find(`ul > li`).Each(func(_ int, item *goquery.Selection) {
		if err != nil {
			// Keep the first error and skip the remaining items.
			return
		}
		n, parseErr := strconv.Atoi(item.Text())
		if parseErr != nil {
			err = parseErr
			return
		}
		partTypes.Slice = append(partTypes.Slice, n)
	})
	if err != nil {
		return partTypes, err
	}

	partTypes.Struct.Name = root.Find(`#test > div > p:nth-child(1)`).Text()
	// ParseUint rejects negative input instead of letting a signed value wrap
	// around when converted to uint; bitSize 0 means the platform's uint size.
	age, err := strconv.ParseUint(root.Find(`#test > div > p:nth-child(2)`).Text(), 10, 0)
	if err != nil {
		return partTypes, err
	}
	partTypes.Struct.Age = uint(age)

	likeLemon, err := strconv.ParseBool(root.Find(`#test > div > p:nth-child(3)`).Text())
	if err != nil {
		return partTypes, err
	}
	partTypes.Struct.LikeLemon = likeLemon

	text := root.Find(`#test > p:nth-child(3)`).Text()
	number, err := strconv.Atoi(root.Find(`#test > p:nth-child(4)`).Text())
	if err != nil {
		return partTypes, err
	}
	float, err := strconv.ParseFloat(root.Find(`#test > p:nth-child(5)`).Text(), 64)
	if err != nil {
		return partTypes, err
	}
	flag, err := strconv.ParseBool(root.Find(`#test > p:nth-child(6)`).Text())
	if err != nil {
		return partTypes, err
	}

	partTypes.String = text
	partTypes.Int = number
	partTypes.Float64 = float
	partTypes.Bool = flag
	return partTypes, nil
}
写得很难受。
而现在你只要这么写：
package main
import (
"encoding/json"
"fmt"
"github.com/Hexilee/unhtml"
"io/ioutil"
)
// TestUser is the nested record stored in PartTypesStruct.Struct. Each field
// carries an `html` struct tag holding a CSS selector (presumably resolved by
// unhtml relative to the parent field's selector — confirm in the unhtml
// README).
type TestUser struct {
	Name      string `html:"p:nth-child(1)"`
	Age       uint   `html:"p:nth-child(2)"`
	LikeLemon bool   `html:"p:nth-child(3)"`
}

// PartTypesStruct is the value the sample HTML document is unmarshalled into.
// Each field's `html` struct tag names the CSS selector its value is taken
// from.
type PartTypesStruct struct {
	Slice   []int    `html:"ul > li"`
	Struct  TestUser `html:"#test > div"`
	String  string   `html:"#test > p:nth-child(3)"`
	Int     int      `html:"#test > p:nth-child(4)"`
	Float64 float64  `html:"#test > p:nth-child(5)"`
	Bool    bool     `html:"#test > p:nth-child(6)"`
}

// Root returns the CSS selector of the element that contains all of the
// struct's data.
func (PartTypesStruct) Root() string {
	const rootSelector = "#test"
	return rootSelector
}
// main unmarshals AllTypeHTML into a PartTypesStruct and prints the result
// as JSON on standard output.
func main() {
	allTypes := PartTypesStruct{}
	// A demo program has no caller to report to, so any failure aborts the
	// run with the error instead of printing a half-filled struct.
	if err := unhtml.Unmarshal(AllTypeHTML, &allTypes); err != nil {
		panic(err)
	}
	result, err := json.Marshal(&allTypes)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(result))
}
就能得到结果：
{
"Slice": [
0,
1,
2,
3
],
"Struct": {
"Name": "Hexilee",
"Age": 20,
"LikeLemon": true
},
"String": "Hello World!",
"Int": 10,
"Float64": 3.14,
"Bool": true
}
开发效率大大提升！但毫无疑问用了大量反射，让人担心它的运行效率。于是我写了两个 Benchmarks：
// BenchmarkUnmarshalPartTypes measures Unmarshal filling a fresh
// PartTypesStruct from AllTypeHTML on every iteration.
func BenchmarkUnmarshalPartTypes(b *testing.B) {
	assert.NotNil(b, AllTypeHTML)
	// Keep the setup check above out of the measured time.
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		partTypes := PartTypesStruct{}
		// A plain error check keeps assertion-library overhead out of the
		// timed loop and stops the benchmark at the first failure.
		if err := Unmarshal(AllTypeHTML, &partTypes); err != nil {
			b.Fatal(err)
		}
	}
}
// BenchmarkParsePartTypesLogically measures the hand-written goquery parser,
// parsePartTypesLogically, on AllTypeHTML.
func BenchmarkParsePartTypesLogically(b *testing.B) {
	assert.NotNil(b, AllTypeHTML)
	// Keep the setup check above out of the measured time.
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// A plain error check keeps assertion-library overhead out of the
		// timed loop and stops the benchmark at the first failure.
		if _, err := parsePartTypesLogically(); err != nil {
			b.Fatal(err)
		}
	}
}
测试结果：
> go test -bench=.
goos: darwin
goarch: amd64
pkg: github.com/Hexilee/unhtml
BenchmarkUnmarshalPartTypes-4 30000 54096 ns/op
BenchmarkParsePartTypesLogically-4 30000 45188 ns/op
PASS
ok github.com/Hexilee/unhtml 4.098s
运行效率稍微低些，但这只是展示和测试用的 HTML，在解析实际中更复杂的 HTML 时两者的运行效率是十分接近的。
一些注意事项和特性请看 README