Go处理json响应并写入excel文件

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
package main

import (
    "encoding/json"
    "fmt"
    "strconv"
    "time"

    "github.com/gocolly/colly/v2"
)

var result []map[string]string

const per_page = 100

type Ghresp []struct {
    Sha    string `json:"sha"`
    NodeID string `json:"node_id"`
    Author struct {
        UserID     uint32 `json:"id"`
        UserAvatar string `json:"avatar_url"`
        UserType   string `json:"type"`
    } `json:"author"`
    Commit struct {
        Author struct {
            AuthorName  string `json:"name"`
            AuthorEmail string `json:"email"`
        } `json:"author"`
        Committer struct {
            CommitDate string `json:"date"`
        } `json:"committer"`
        CommitMessage string `json:"message"`
    } `json:"commit"`
}

func main() {
    defer TraceTime()()

    c := colly.NewCollector(
        colly.Async(true),
        colly.AllowURLRevisit(),
        // colly.Debugger(&debug.LogDebugger{}),
    )

    c.Limit(&colly.LimitRule{
        DomainGlob:  "*",
        Parallelism: 30,
        // RandomDelay: 1 * time.Second,
    })

    // c.SetProxy("socks5://127.0.0.1:1081")

    c.OnRequest(func(r *colly.Request) {
        r.Ctx.Put("start", time.Now().Format(time.RFC3339Nano))
        r.Headers.Set("Accept", "application/vnd.github+json")
        r.Headers.Set("Authorization", "token xxxx")
        r.Headers.Set("X-GitHub-Api-Version", "2022-11-28")
    })

    c.OnError(func(r *colly.Response, err error) {
        fmt.Println(err, "Page:", r.Request.URL.Query().Get("page"))
        c.Visit(r.Request.URL.String()) // 重试
    })

    c.OnResponse(func(r *colly.Response) {
        starttimestr := r.Ctx.Get("start")
        starttime, _ := time.Parse(time.RFC3339Nano, starttimestr)
        fmt.Println("Page:", r.Request.URL.Query().Get("page"), "开始时间:\t", starttimestr, "耗时:", time.Since(starttime))
        if r.StatusCode == 200 {
            var ghresp Ghresp
            json.Unmarshal(r.Body, &ghresp)
            // fmt.Println("本次返回条数:\t", len(ghresp))
            for i := 0; i < len(ghresp); i++ {
                temp := map[string]string{
                    "Sha":           ghresp[i].Sha,
                    "CommitDate":    ghresp[i].Commit.Committer.CommitDate,
                    "CommitMessage": ghresp[i].Commit.CommitMessage,
                    "NodeID":        ghresp[i].NodeID,
                    "AuthorName":    ghresp[i].Commit.Author.AuthorName,
                    "AuthorID":      strconv.FormatUint(uint64(ghresp[i].Author.UserID), 10),
                    "AuthorAvatar":  ghresp[i].Author.UserAvatar,
                    "AuthorType":    ghresp[i].Author.UserType,
                    "AuthorEmail":   ghresp[i].Commit.Author.AuthorEmail,
                }
                result = append(result, temp)
            }
        }
    })
    for page_num := 1; page_num <= (117160+per_page-1)/per_page; page_num++ { //117110
        // c.Visit(fmt.Sprintf("https://api.github.com/repos/microsoft/vscode/commits?per_page=%d&page=%d", per_page, page_num))
        c.Visit(fmt.Sprintf("https://api.github.com/repos/freeCodeCamp/freeCodeCamp/stargazers?per_page=%d&page=%d", per_page, page_num))
    }
    c.Wait()
    fmt.Println("总计条数:\t", len(result))
    WriteMapsToXLSX(result, "github.xlsx")
}
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
package main

import (
	"fmt"
	"strconv"
	"time"

	"github.com/gocolly/colly/v2"
	"github.com/tidwall/gjson"
	"github.com/xuri/excelize/v2"
)

var stop bool 

const per_page = 100

func main() {
	// Initialize Excel file
	f := excelize.NewFile()
	headers := []string{"Sha", "CommitDate", "CommitMessage", "NodeID", "AuthorName", "AuthorID", "AuthorAvatar", "AuthorType", "AuthorEmail"}
	for i, header := range headers {
		cell := fmt.Sprintf("%s%d", string(rune('A'+i)), 1)
		f.SetCellValue("Sheet1", cell, header)
	}

	row := 2
	c := colly.NewCollector(
		colly.Async(true),
		colly.AllowURLRevisit(),
		// colly.Debugger(&debug.LogDebugger{}),
	)

	c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 30,
		RandomDelay: 5 * time.Second,
	})

	// c.SetProxy("socks5://127.0.0.1:1081")

	c.OnRequest(func(r *colly.Request) {
		if stop { //如果stop标志为true,则不再执行请求
			r.Abort()
		}
		r.Ctx.Put("start", time.Now().Format(time.RFC3339Nano))
		r.Headers.Set("Accept", "application/vnd.github+json")
		r.Headers.Set("Authorization", "token xxxxxxxxx")
		r.Headers.Set("X-GitHub-Api-Version", "2022-11-28")
	})

	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("Error:", err, ",Page:", r.Request.URL.Query().Get("page"), ",StatuCode:", r.StatusCode, ",RAW:", string(r.Body))
		stop = true
		// c.Visit(r.Request.URL.String()) // 重试
	})

	c.OnResponse(func(r *colly.Response) {
		starttimestr := r.Ctx.Get("start")
		starttime, _ := time.Parse(time.RFC3339Nano, starttimestr)
		fmt.Println("Page:", r.Request.URL.Query().Get("page"), ",开始于:", starttimestr, ",耗时:", time.Since(starttime))
		if r.StatusCode == 200 {
			// Parse JSON with gjson
			result := gjson.ParseBytes(r.Body).Array()
			for _, item := range result {
				sha := item.Get("sha").String()
				commitDate := item.Get("commit.committer.date").String()
				commitMessage := item.Get("commit.message").String()
				nodeID := item.Get("node_id").String()
				authorName := item.Get("commit.author.name").String()
				authorID := strconv.Itoa(int(item.Get("author.id").Int()))
				authorAvatar := item.Get("author.avatar_url").String()
				authorType := item.Get("author.type").String()
				authorEmail := item.Get("commit.author.email").String()

				// Write data to Excel
				values := []string{sha, commitDate, commitMessage, nodeID, authorName, authorID, authorAvatar, authorType, authorEmail}
				for i, value := range values {
					cell := fmt.Sprintf("%s%d", string(rune('A'+i)), row)
					f.SetCellValue("Sheet1", cell, value)
				}
				row++
			}
		}
	})
	for page_num := 1; page_num <= (117160+per_page-1)/per_page; page_num++ { //117110
		c.Visit(fmt.Sprintf("https://api.github.com/repos/microsoft/vscode/commits?per_page=%d&page=%d", per_page, page_num))
	}
	c.Wait()
	// Save Excel file
	if err := f.SaveAs("GitHubCommits.xlsx"); err != nil {
		fmt.Println(err)
	}

	fmt.Println("Total records:", row-2)
}
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
package main

import (
	"fmt"
	"time"

	"github.com/gocolly/colly/v2"
	"github.com/tidwall/gjson"
	"github.com/xuri/excelize/v2"
)


const per_page = 100

func main() {
	c := colly.NewCollector(
		colly.Async(true),
		colly.AllowURLRevisit(),
		// colly.Debugger(&debug.LogDebugger{}),
	)

	c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 30,
		RandomDelay: 1 * time.Second,
	})

	// c.SetProxy("socks5://127.0.0.1:1081")

	c.OnRequest(func(r *colly.Request) {
		r.Ctx.Put("start", time.Now().Format(time.RFC3339Nano))
		r.Headers.Set("Accept", "application/vnd.github+json")
		r.Headers.Set("Authorization", "token xxxxx")
		r.Headers.Set("X-GitHub-Api-Version", "2022-11-28")
	})

	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("Error:", err, ",Page:", r.Request.URL.Query().Get("page"), ",StatuCode:", r.StatusCode, ",RAW:", string(r.Body))
		c.Visit(r.Request.URL.String()) // 重试
	})
	// 创建一个新的 XLSX 文件
	f := excelize.NewFile()
	// 创建一个工作表
	index, _ := f.NewSheet("Sheet1")

	// 设置表头
	headers := []string{"login", "id", "node_id", "avatar_url", "gravatar_id", "url", "html_url", "followers_url", "following_url", "gists_url", "starred_url", "subscriptions_url", "organizations_url", "repos_url", "events_url", "received_events_url", "type", "site_admin"}
	for i, header := range headers {
		column := string(rune('A' + i))
		cell := fmt.Sprintf("%s%d", column, 1)
		f.SetCellValue("Sheet1", cell, header)
	}
	// 动态跟踪行号
	row := 2
	c.OnResponse(func(r *colly.Response) {
		starttimestr := r.Ctx.Get("start")
		starttime, _ := time.Parse(time.RFC3339Nano, starttimestr)
		fmt.Println("Page:", r.Request.URL.Query().Get("page"), ",开始于:", starttimestr, ",耗时:", time.Since(starttime))
		if r.StatusCode == 200 {
			// 解析 JSON 数据
			gresults := gjson.ParseBytes(r.Body)
			// 遍历 JSON 数组,并写入数据
			gresults.ForEach(func(key, value gjson.Result) bool {
				// 使用循环迭代字段
				for i, colname := range headers {
					column := string(rune('A' + i))
					cell := fmt.Sprintf("%s%d", column, row)
					f.SetCellValue("Sheet1", cell, value.Get(colname).Value())
				}
				row++       // 移动到下一行
				return true // 继续遍历
			})
		}
	})
	for page_num := 1; page_num <= (379+per_page-1)/per_page; page_num++ {
		// c.Visit(fmt.Sprintf("https://api.github.com/repos/microsoft/vscode/commits?per_page=%d&page=%d", per_page, page_num))
		c.Visit(fmt.Sprintf("https://api.github.com/repos/freeCodeCamp/freeCodeCamp/stargazers?per_page=%d&page=%d", per_page, page_num)) //379226
	}
	c.Wait()
	fmt.Println("总计条数:\t", row-2)
	// 设置默认打开的工作表
	f.SetActiveSheet(index)
	// 保存文件
	if err := f.SaveAs("GitHubUsers.xlsx"); err != nil {
		fmt.Println(err)
	}
}