您的位置:首页 > 产品设计 > UI/UE

Golang---goquery爬虫获取golang语言中文网页面信息并保存MySQL

2017-11-12 16:02 771 查看
    由于最近一直在golang语言中文网上看帖子,所以打算使用golang写一个爬虫把帖子信息抓取下来,并保存到mysql中。

    以下是完整代码:

// goquery_spider_test project main.go
package main

import (
"database/sql"
"fmt"
"goquery-master"
"log"
//	"net/http"
"strconv"
"strings"

_ "github.com/go-sql-driver/mysql"
)

// Package-level scrape state shared by main and getpageinfo.
var (
// base URL prepended to the relative topic/author hrefs scraped from the page
commonurl = "https://studygolang.com"
// author of the topic currently being processed (written by getpageinfo)
userid    = ""
// scratch record: title, topic URL, author id, author URL, reply count
topicinfo = []string{"", "", "", "", ""}
)

// DbWorker holds the MySQL connection configuration.
type DbWorker struct {
//mysql data source name
Dsn string
}

// main connects to MySQL, then walks the studygolang topic-list pages
// (https://studygolang.com/topics?p=1, 2, ...) until it reaches an empty
// page, scraping each page's topics into the database via getpageinfo.
func main() {
	dbw := DbWorker{
		Dsn: "root:haige@tcp(localhost:3306)/studygolang_topic?charset=utf8",
	}
	db, err := sql.Open("mysql", dbw.Dsn)
	if err != nil {
		panic(err)
	}
	// Close only after we know db is non-nil; the original deferred the
	// close before the error check — and deferred it a second time at the
	// end of main.
	defer db.Close()
	// sql.Open only validates the DSN; Ping actually dials the server, so
	// the success message below is truthful.
	if err = db.Ping(); err != nil {
		panic(err)
	}
	fmt.Println("数据库链接成功!")

	pageurl := "https://studygolang.com/topics?p="
	for index := 1; ; index++ {
		pagelasturl := pageurl + strconv.Itoa(index)
		doc, err := goquery.NewDocument(pagelasturl)
		if err != nil {
			// The original discarded this error and would have crashed on
			// the nil doc below.
			log.Fatal(err)
		}
		// An empty topic list means we have run past the last page.
		if doc.Find("div.topic").Text() == "" {
			break
		}
		getpageinfo(db, pagelasturl)
		fmt.Println(pagelasturl)
	}
}

// getpageinfo fetches one topic-list page, extracts for every topic its
// title, topic URL, author, author URL and reply count, sanitizes the
// fields with splitstring, and inserts each record via dbmanager.
//
// Unlike the original, the record is assembled in a local slice instead
// of the shared package-level `userid`/`topicinfo` variables, so calls
// carry no hidden cross-call state.
func getpageinfo(db *sql.DB, pagefullurl string) {
	doc, err := goquery.NewDocument(pagefullurl)
	if err != nil {
		log.Fatal(err)
	}
	doc.Find("div.topic").Each(func(i int, contentSelection *goquery.Selection) {
		// Topic title and its (relative) URL.
		title := contentSelection.Find("div.title a").Text()
		topicurl, _ := contentSelection.Find("div.title a").Eq(0).Attr("href")
		// Author name and (relative) profile URL.
		author := contentSelection.Find("div.meta a").Find("strong").Text()
		userurl, _ := contentSelection.Find("dt.avatar a").Eq(0).Attr("href")
		// Reply count; an absent badge means zero replies.
		click := contentSelection.Find("div.pull-right a").Text()
		if click == "" {
			click = "0"
		}
		record := []string{title, commonurl + topicurl, author, commonurl + userurl, click}
		// Strip characters that would break the INSERT statement.
		record = splitstring(record)
		dbmanager(db, record)
	})
}

// splitstring returns a copy of pageinfo with the characters that break
// the hand-built SQL INSERT — ASCII and smart quotes, and fullwidth
// comma/question mark — each replaced by a single space.
//
// The original version aliased its input (`spilitinfo := pageinfo`) and
// only worked because each strings.Replace call re-read the element the
// previous call had just written; it also hard-coded the length 5. This
// version does one strings.Replacer pass per element (the Replacer tries
// the patterns in argument order at each position, matching the original
// sequential-Replace result), works for any slice length, and does not
// mutate the caller's slice.
func splitstring(pageinfo []string) []string {
	replacer := strings.NewReplacer(
		"'''", " ",
		"'", " ",
		"''", " ",
		"’", " ",
		"‘", " ",
		"“", " ",
		"”", " ",
		",", " ",
		"?", " ",
	)
	cleaned := make([]string, len(pageinfo))
	for i, field := range pageinfo {
		cleaned[i] = replacer.Replace(field)
	}
	return cleaned
}

// dbmanager inserts one scraped topic record (title, topic URL, author,
// author URL, reply count) into the golang_topic table.
//
// It uses a parameterized statement instead of string concatenation,
// closing the SQL-injection hole that splitstring's quote-stripping was
// papering over, and it closes the prepared statement (the original
// leaked it) and checks the Exec error (the original ignored it).
func dbmanager(db *sql.DB, info []string) {
	const query = "INSERT INTO golang_topic VALUES(?,?,?,?,?)"
	stmt, err := db.Prepare(query)
	if err != nil {
		// Printf, not Println: the original passed a %v format string to
		// Println, which prints the verb literally.
		fmt.Printf("insert data error: %v\n", err)
		return
	}
	defer stmt.Close()
	if _, err = stmt.Exec(info[0], info[1], info[2], info[3], info[4]); err != nil {
		fmt.Printf("insert data error: %v\n", err)
	}
}


步骤:

1、连接 MySQL

2、设置抓取规则,使用goquery解析html网页,获取想要的字段信息

3、保存mysql (字段为:topic_name(帖子名称), topic_url(帖子url), topic_userid (帖子发布作者), topic_userurl (作者url), topic_click (帖子回复数量))

      由于刚开始golang的学习,对其中字符串的函数使用不熟练,在向mysql插入数据的时候,由于一些转义字符或者是特殊字符无法完成插入,后通过strings.Replace()函数来进行笨拙的替换,后期会继续学习并优化函数处理以及爬虫规模

     代码中需要加载一些包,在此就不详细描述,如有需要,可留言回复,共同学习。

下面是抓取的mysql截图



    Go语言中文网页面总共有81页,3416条数据,抓取到mysql以每页1000条显示(Navicat for MySQL)

   由于刚接触golang,如有不足,请互相交流学习
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
相关文章推荐