本博客由 [Pipe](https://github.com/b3log/pipe) 强力驱动

周末用 Go 写一个爬虫

前几天和 BBAE 的销售大佬聊天, 他说他家的网站内容随便爬, 我说好, 于是就趁着周末撸了一个爬虫。这个爬虫特意用了 FAN-OUT 和 FAN-IN 模式, 对这块感兴趣的同学可以自己看一下。

代码如下:


package jobs

import (
	"bytes"
	"encoding/json"
	"fmt"
	"github.com/gocolly/colly"
	"net/http"
	"sync"
)

// Article is the unit of data passed through the crawl pipeline and the
// JSON payload posted to the sync endpoint.
type Article struct {
	// Title is the text of the headline link found on the list page.
	Title string `json:"title"`
	// SourceUrl is the href of the headline link; the detail stage visits it.
	SourceUrl string `json:"source_url"`
	// Body is filled in by the detail stage with the fetched page content.
	Body string `json:"body"`
}

// SpiderList crawls the 5imeigu.com front page and streams one Article per
// matched headline link (the fan-out source of the pipeline).
//
// Each emitted Article carries only Title (the link text) and SourceUrl (the
// link's href attribute); Body is left empty. The returned channel is closed
// when the background goroutine finishes, whether or not the visit succeeded.
func SpiderList() <-chan Article {
	out := make(chan Article, 100)
	go func() {
		defer close(out)
		c := colly.NewCollector()

		c.OnHTML("div.la-list02 a.htitle-color", func(r *colly.HTMLElement) {
			out <- Article{SourceUrl: r.Attr("href"), Title: r.Text}
		})

		// There is no error return, so report a failed crawl instead of
		// letting it look like an empty article list.
		if err := c.Visit("https://5imeigu.com/"); err != nil {
			fmt.Println("spider list: visit https://5imeigu.com/ failed:", err)
		}
	}()
	return out
}

// SpiderDetail is a pipeline worker: it reads Articles from articles, visits
// each SourceUrl, and emits the Article with Body filled in.
//
// Several workers may share the same input channel; each Article is taken by
// exactly one of them. An Article is emitted once per element matching
// "div.data-article" on its page, so a page without that element produces no
// output. The returned channel is closed after the input channel is drained.
func SpiderDetail(articles <-chan Article) <-chan Article {
	out := make(chan Article, 10)
	go func() {
		defer close(out)
		for article := range articles {
			c := colly.NewCollector()
			c.OnHTML("div.data-article", func(r *colly.HTMLElement) {
				// NOTE(review): this stores the whole response body, not
				// just the matched element — confirm that is intended.
				article.Body = string(r.Response.Body)
				out <- article
			})

			// A failed visit drops the article from the stream; report it
			// so the loss is visible, then carry on with the next one.
			if err := c.Visit(article.SourceUrl); err != nil {
				fmt.Println("spider detail: visit", article.SourceUrl, "failed:", err)
			}
		}
	}()

	return out
}

// Merge fans several Article channels into one.
//
// One goroutine per input forwards everything it receives to the shared,
// unbuffered result channel. Once every input has been closed and drained,
// a completion message is printed and the result channel is closed.
func Merge(inputs ...<-chan Article) <-chan Article {
	merged := make(chan Article)

	var wg sync.WaitGroup
	wg.Add(len(inputs))

	for _, src := range inputs {
		// Pass the channel as an argument so each forwarder owns its own copy.
		go func(ch <-chan Article) {
			defer wg.Done()
			for item := range ch {
				merged <- item
			}
		}(src)
	}

	// Close the result only after all forwarders have finished sending.
	go func() {
		wg.Wait()
		fmt.Println("全部完成....")
		close(merged)
	}()

	return merged
}

// SyncTopicToInvest posts article as a JSON document to the investguider
// sync endpoint.
//
// The function has no error return and keeps the original contract of
// panicking when the request cannot be sent. It now also panics with the
// underlying error if the payload cannot be encoded or the request cannot be
// built, where previously a failed request construction caused a nil-pointer
// panic. A response with a non-2xx status is reported on stdout instead of
// being silently treated as success.
func SyncTopicToInvest(article Article) {
	endpoint := "https://investguider.com/api接口之马赛克"

	var payload bytes.Buffer
	if err := json.NewEncoder(&payload).Encode(&article); err != nil {
		panic(err)
	}

	// http.Post uses the default client, as the zero-value client did before,
	// and returns request-construction errors rather than a nil request.
	resp, err := http.Post(endpoint, "application/json", &payload)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		fmt.Println("sync topic: unexpected status", resp.Status, "for", article.SourceUrl)
	}
}

// SpiderMeigu runs the whole crawl pipeline: the list stage feeds two detail
// workers (fan-out), their outputs are merged (fan-in), and every finished
// article is printed and pushed to the sync endpoint.
//
// It blocks until the merged stream is closed and then returns the value
// produced by SuccessResp for a {"title": "success"} payload.
func SpiderMeigu() interface{} {
	listing := SpiderList()

	// Both workers consume the same list channel, splitting the work.
	merged := Merge(SpiderDetail(listing), SpiderDetail(listing))

	for {
		done, ok := <-merged
		if !ok {
			break
		}
		fmt.Println("完成: ", done.SourceUrl)
		SyncTopicToInvest(done)
	}

	return SuccessResp(map[string]string{"title": "success"})
}

运行结果如下:

2019/12/01 15:21:10 start consume persist.5imeigu
完成:  https://5imeigu.com/archives/1476
完成:  https://5imeigu.com/archives/1466
完成:  https://5imeigu.com/archives/1453
完成:  https://5imeigu.com/archives/1450
完成:  https://5imeigu.com/archives/1445
完成:  https://5imeigu.com/archives/1442
完成:  https://5imeigu.com/archives/1440
完成:  https://5imeigu.com/archives/1430
完成:  https://5imeigu.com/archives/1414
完成:  https://5imeigu.com/archives/1413
全部完成....

留下你的脚步