您的位置:首页 > 编程语言 > Go语言

go 简单爬虫

2016-02-02 11:04 399 查看

通过xpath

未加入goroutine和channel等并发机制,如有需要可自行添加

使用sqlite

package main

import (
    "database/sql"
    "fmt"
    "io/ioutil"
    "log"
    "net/http"
    "os"
    "strings"

    _ "github.com/mattn/go-sqlite3"
    "github.com/moovweb/gokogiri"
    "github.com/moovweb/gokogiri/xpath"
)

// Doub is one row of the douban table: a scraped title together with the
// URL of its detail page and the text of that page's info block.
type Doub struct {
    // id is the value of the table's integer primary key column.
    id int
    // name, url and info mirror the text columns of the same names.
    name, url, info string
}

func main() {

    // 获取并读取网页
    resp, _ := http.Get("http://movie.douban.com/tv/")

    page, _ := ioutil.ReadAll(resp.Body)

    // 解析web
    doc, err := gokogiri.ParseHtml(page)

    if err != nil {
        fmt.Println("Parsing err")
        return
    }

    //  提供xpath
    xps1 := xpath.Compile("//div/table/tr[@class='item']/td[2]/div[@class='pl2']/a")
    //  按xpath搜索
    ss1, err := doc.Root().Search(xps1)

    // sqlite 3
    os.Remove("./foo.db")
    db, err := sql.Open("sqlite3", "./foo.db")
    if err != nil {
        log.Fatal(err)
    }
    defer db.Close()

    //  建表
    sql := `create table douban(id integer primary key autoincrement , name text, url text, info text);`
    db.Exec(sql)

    tx, err := db.Begin()
    if err != nil {
        log.Fatal(err)
    }
    // 插入准备
    stmt, err := tx.Prepare("insert into douban(id, name, url, info) values(?,?,?,?);")
    if err != nil {
        log.Fatal(err)
    }
    defer stmt.Close()
    //  执行插入
    for _, s1 := range ss1 {

        resp, _ := http.Get(s1.Attributes()["href"].String())
        page, _ := ioutil.ReadAll(resp.Body)
        doc, _ := gokogiri.ParseHtml(page)

        xps2 := xpath.Compile("//div[@id='info']")
        ss2, _ := doc.Root().Search(xps2)
        for _, s2 := range ss2 {

            _, err = stmt.Exec(nil, strings.Split(s1.Content(), "/")[0], s1.Attributes()["href"].String(), s2.Content())
            if err != nil {
                log.Fatal(err)
            }

        }
    }
    tx.Commit()

    // 查询
    var douban []Doub = make([]Doub, 0)

    rows, err := db.Query("select id, name, url, info from douban")
    if err != nil {
        log.Fatal(err)
    }
    defer rows.Close()

    for rows.Next() {
        var doub Doub
        rows.Scan(&doub.id, &doub.name, &doub.url, &doub.info)
        douban = append(douban, doub)
    }

    fmt.Println(douban)

    rows.Close()

    defer doc.Free()

}


使用mysql

package main

import (
    "database/sql"
    "fmt"
    "io/ioutil"
    "log"
    "net/http"
    "strings"

    _ "github.com/go-sql-driver/mysql"
    "github.com/moovweb/gokogiri"
    "github.com/moovweb/gokogiri/xpath"
)

// Doub holds one record read back from the douban table.
type Doub struct {
    // id receives the id column of the row.
    id int
    // name, url and info receive the columns of the same names.
    name, url, info string
}

func main() {

    // 获取并读取网页
    resp, _ := http.Get("http://movie.douban.com/tv/")

    page, _ := ioutil.ReadAll(resp.Body)

    // 解析web
    doc, err := gokogiri.ParseHtml(page)

    if err != nil {
        fmt.Println("Parsing err")
        return
    }

    //  提供xpath
    xps1 := xpath.Compile("//div/table/tr[@class='item']/td[2]/div[@class='pl2']/a")
    //  按xpath搜索
    ss1, err := doc.Root().Search(xps1)

    // mysql
    db, err := sql.Open("mysql", "root:root@/test")
    if err != nil {
        log.Fatalf("Open database error: %s\n", err)
    }
    defer db.Close()

    tx, err := db.Begin()
    if err != nil {
        log.Fatal(err)
    }

    // 插入准备
    stmt, err := db.Prepare("insert into douban(name, url, info) values(?,?,?);")
    defer stmt.Close()

    if err != nil {
        log.Println(err)
        return
    }

    //  执行插入
    for _, s1 := range ss1 {

        resp, _ := http.Get(s1.Attributes()["href"].String())
        page, _ := ioutil.ReadAll(resp.Body)
        doc, _ := gokogiri.ParseHtml(page)

        xps2 := xpath.Compile("//div[@id='info']")
        ss2, _ := doc.Root().Search(xps2)
        for _, s2 := range ss2 {

            _, err = stmt.Exec(strings.Split(s1.Content(), "/")[0], s1.Attributes()["href"].String(), s2.Content())
            if err != nil {
                log.Fatal(err)
            }

        }
    }
    tx.Commit()

    // 查询
    var douban []Doub = make([]Doub, 0)

    rows, err := db.Query("select id, name, url, info from douban")
    if err != nil {
        log.Fatal(err)
    }
    defer rows.Close()

    for rows.Next() {
        var doub Doub
        rows.Scan(&doub.id, &doub.name, &doub.url, &doub.info)
        douban = append(douban, doub)
    }

    fmt.Println(douban)

    rows.Close()

    defer doc.Free()

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: