// Command main downloads pages of stargazers of the freeCodeCamp/freeCodeCamp
// repository from the GitHub REST API and writes them to GitHubUsers.xlsx.
package main
import (
	"fmt"
	"sync"
	"time"

	"github.com/gocolly/colly/v2"
	"github.com/tidwall/gjson"
	"github.com/xuri/excelize/v2"
)
// per_page is the value sent as the per_page query parameter of every API
// request; main also uses it to compute how many pages to request.
const per_page = 100
// main fetches stargazer pages from the GitHub REST API concurrently and writes
// one spreadsheet row per user to GitHubUsers.xlsx.
//
// The collector runs in async mode, so response callbacks execute concurrently;
// the shared row counter is therefore guarded by a mutex. Failed requests are
// retried a bounded number of times instead of forever.
func main() {
	const (
		sheet      = "Sheet1"
		totalItems = 379 // number of records to fetch; NOTE(review): the original comment mentions 379226 — confirm the intended total
		maxRetries = 3   // upper bound on retries per failed request
		retryKey   = "retries"
		startKey   = "start"
	)

	c := colly.NewCollector(
		colly.Async(true),
		colly.AllowURLRevisit(),
		// colly.Debugger(&debug.LogDebugger{}),
	)
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 30,
		RandomDelay: 1 * time.Second,
	}); err != nil {
		fmt.Println("Error:", err)
		return
	}
	// c.SetProxy("socks5://127.0.0.1:1081")

	c.OnRequest(func(r *colly.Request) {
		// Remember when the request was issued so the response handler can
		// report how long it took.
		r.Ctx.Put(startKey, time.Now())
		r.Headers.Set("Accept", "application/vnd.github+json")
		r.Headers.Set("Authorization", "token xxxxx")
		r.Headers.Set("X-GitHub-Api-Version", "2022-11-28")
	})

	c.OnError(func(r *colly.Response, err error) {
		page := r.Request.URL.Query().Get("page")
		fmt.Println("Error:", err, ",Page:", page, ",StatuCode:", r.StatusCode, ",RAW:", string(r.Body))

		// Retry, but only a limited number of times: a permanent failure
		// would otherwise loop forever and c.Wait() would never return.
		// Retry re-submits the request with the same context, so the counter
		// survives across attempts.
		attempts, _ := r.Request.Ctx.GetAny(retryKey).(int)
		if attempts >= maxRetries {
			fmt.Println("Error: giving up on page", page, "after", attempts, "retries")
			return
		}
		r.Request.Ctx.Put(retryKey, attempts+1)
		if retryErr := r.Request.Retry(); retryErr != nil {
			fmt.Println("Error:", retryErr, ",Page:", page)
		}
	})

	// Create a new XLSX workbook.
	f := excelize.NewFile()
	defer func() {
		if err := f.Close(); err != nil {
			fmt.Println(err)
		}
	}()

	// Create the worksheet.
	index, err := f.NewSheet(sheet)
	if err != nil {
		fmt.Println(err)
		return
	}

	// setCell writes value at the 1-based (col, row) coordinate and reports
	// any failure instead of silently dropping it.
	setCell := func(col, row int, value interface{}) {
		cell, err := excelize.CoordinatesToCellName(col, row)
		if err != nil {
			fmt.Println(err)
			return
		}
		if err := f.SetCellValue(sheet, cell, value); err != nil {
			fmt.Println(err)
		}
	}

	// Write the header row. Each header doubles as the JSON field name read
	// from every array element of a response.
	headers := []string{"login", "id", "node_id", "avatar_url", "gravatar_id", "url", "html_url", "followers_url", "following_url", "gists_url", "starred_url", "subscriptions_url", "organizations_url", "repos_url", "events_url", "received_events_url", "type", "site_admin"}
	for i, header := range headers {
		setCell(i+1, 1, header)
	}

	// row is the next free data row; mu serializes access to it because
	// response callbacks run concurrently.
	var mu sync.Mutex
	row := 2

	c.OnResponse(func(r *colly.Response) {
		page := r.Request.URL.Query().Get("page")
		started, _ := r.Ctx.GetAny(startKey).(time.Time)
		fmt.Println("Page:", page, ",开始于:", started.Format(time.RFC3339Nano), ",耗时:", time.Since(started))

		if r.StatusCode != 200 {
			return
		}

		// Parse the JSON payload; only an array of records is usable.
		gresults := gjson.ParseBytes(r.Body)
		if !gresults.IsArray() {
			fmt.Println("Error: unexpected non-array response", ",Page:", page)
			return
		}

		// Hold the lock for the whole page so its rows stay contiguous and
		// no two callbacks ever claim the same row.
		mu.Lock()
		defer mu.Unlock()
		gresults.ForEach(func(_, value gjson.Result) bool {
			for i, colname := range headers {
				setCell(i+1, row, value.Get(colname).Value())
			}
			row++       // advance to the next row
			return true // keep iterating
		})
	})

	// Queue every page; the round-up division covers a final partial page.
	pages := (totalItems + per_page - 1) / per_page
	for pageNum := 1; pageNum <= pages; pageNum++ {
		// c.Visit(fmt.Sprintf("https://api.github.com/repos/microsoft/vscode/commits?per_page=%d&page=%d", per_page, pageNum))
		url := fmt.Sprintf("https://api.github.com/repos/freeCodeCamp/freeCodeCamp/stargazers?per_page=%d&page=%d", per_page, pageNum)
		if err := c.Visit(url); err != nil {
			fmt.Println("Error:", err, ",Page:", pageNum)
		}
	}
	c.Wait()

	// All callbacks have finished, so row can be read without the lock.
	fmt.Println("总计条数:\t", row-2)

	// Select the sheet that opens by default.
	f.SetActiveSheet(index)

	// Save the workbook.
	if err := f.SaveAs("GitHubUsers.xlsx"); err != nil {
		fmt.Println(err)
	}
}
|