您的位置：首页 > 理论基础 > 计算机网络

利用 C# 爬虫下载网络上的小说（思路适用于大部分网站）

2018-10-22 15:24 141 查看

本文纯属个人爱好，不含任何商业用途。
基本思路：爬取网站内容的基本方法是利用 HTML 标签进行匹配。我们以一个小说网站为例：http://www.jjwxc.net/onebook.php?novelid=3325239&chapterid=6 是某本小说的一个章节。可以看到，novelid=3325239 在该网站中对应小说的唯一标识，chapterid=6 对应小说的章节号。根据这两个参数，我们就可以访问任何一本小说及其任意章节。下载小说首先需要拿到小说的名称：查看网页源代码，这里我选择从隐藏的 input 元素（id 为 novel_name）中截取小说名称，源码如下：

// Capture the novel title from the hidden <input id="novel_name"> element.
// [^"]+ stops at the closing quote, so nothing after the value attribute can be
// swallowed the way a greedy ".+" would when more markup follows on the same line.
Regex ma_name = new Regex(@"<input type=""hidden"" id=""novel_name"" value=""([^""]+)""/>");
var mu_name = ma_name.Match(html);
// Groups[1].Value is the empty string when the element is not present in the page.
text = mu_name.Groups[1].Value;

接下来获取章节名。可以发现章节名很明显位于 h2 标签中，那么我们把章节名取出来，源码如下：

// Grab the first <h2>...</h2> element, which holds the chapter title.
// The tag literals must not contain spaces, otherwise they never match the page markup.
Regex reg_zjm = new Regex(@"<h2>(.|\n)*?</h2>");
var mat_zjm = reg_zjm.Match(html);
// Groups[0] is the whole match (tags included); it is "" when no <h2> was found.
string zjm = mat_zjm.Groups[0].ToString();
// Swap the surrounding tags for spaces so only the title text remains.
zjm = zjm.Replace("<h2>", " ");
zjm = zjm.Replace("</h2>", " ");

最后再获取文章的内容就可以了。方法相同，不再细说，直接上代码：

// The chapter text runs from the first <br> up to the closing </div></td> of its table cell.
// The tag literals must not contain spaces, otherwise they never match the page markup.
Regex reg_mulu = new Regex(@"<br>(.|\n)*?</div></td>");
var mat_mulu = reg_mulu.Match(html);
string nrong = mat_mulu.Groups[0].ToString();
// Turn <br> line breaks into real line breaks.
nrong = nrong.Replace("<br>", "\r\n");
// Drop every remaining tag. Splitting on the character 'd' (the previous approach)
// truncated the chapter at the first letter "d" that appeared anywhere in the text.
nrong = Regex.Replace(nrong, "<[^>]*>", "");
Content = "第" + i + "章" + zjm + "\r\n" + nrong + "\r\n";

最后输出文本

这里我做成了一个 WinForm 程序，可以浏览并选择本地的任意路径作为输出目录。选择路径的代码如下：

// Folder picker; disposed as soon as the user has made a choice.
using (FolderBrowserDialog ofd = new FolderBrowserDialog())
{
    // Start in the current user's desktop folder instead of a path that only exists on one machine.
    ofd.SelectedPath = Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory);
    // Only react when the user confirmed the dialog.
    if (ofd.ShowDialog() == DialogResult.OK)
    {
        // Show the chosen folder in the text box.
        txtFilePathOpen.Text = ofd.SelectedPath;
    }
}
在文本框中输入小说的标识，选中下载路径，点击下载就可以了。

完整代码
TextHelper.cs

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;

namespace WindowsFormsApp1
{
/// <summary>
/// File-output and HTTP helpers used by the novel downloader.
/// </summary>
class TextHelper
{
    /// <summary>
    /// Appends <paramref name="content"/> (followed by a blank line) to the file
    /// "<paramref name="name"/>.txt" inside <paramref name="path"/>, creating the
    /// directory and the file when they do not exist yet.
    /// </summary>
    /// <param name="content">Text to append.</param>
    /// <param name="name">File name without the ".txt" extension.</param>
    /// <param name="path">Directory that receives the file.</param>
    public void Novel(string content, string name, string path)
    {
        string log = content + "\r\n";

        // CreateDirectory does nothing when the directory already exists.
        Directory.CreateDirectory(path);

        string filePath = Path.Combine(path, name + ".txt");

        // FileMode.Append creates the file when it is missing, so one code path covers
        // both the first write and later ones. The using blocks release the file handle
        // even when the write throws.
        using (FileStream stream = new FileStream(filePath, FileMode.Append, FileAccess.Write))
        using (StreamWriter writer = new StreamWriter(stream))
        {
            writer.WriteLine(log);
        }
    }

    /// <summary>
    /// Sends <paramref name="postDataStr"/> as a form-urlencoded POST body (gb2312 encoded)
    /// to <paramref name="Url"/> and returns the response body decoded as UTF-8.
    /// </summary>
    /// <param name="Url">Target address.</param>
    /// <param name="postDataStr">Already url-encoded form data.</param>
    /// <returns>The response body as text.</returns>
    public string HttpPost(string Url, string postDataStr)
    {
        // Encode once so that ContentLength is exactly the number of bytes written;
        // measuring in UTF-8 while writing gb2312 disagrees for non-ASCII data.
        byte[] body = Encoding.GetEncoding("gb2312").GetBytes(postDataStr);

        CookieContainer cookie = new CookieContainer();
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";
        request.ContentLength = body.Length;
        request.CookieContainer = cookie;

        using (Stream requestStream = request.GetRequestStream())
        {
            requestStream.Write(body, 0, body.Length);
        }

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            response.Cookies = cookie.GetCookies(response.ResponseUri);

            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding("utf-8")))
            {
                return reader.ReadToEnd();
            }
        }
    }

    /// <summary>
    /// Downloads <paramref name="Url"/> with a GET request and returns the body decoded as GBK.
    /// A gzip-compressed body is decompressed first; an uncompressed body is read as-is.
    /// </summary>
    /// <param name="Url">Address to download.</param>
    /// <param name="postDataStr">Not used by this method; kept so existing callers compile.</param>
    /// <returns>The response body as text.</returns>
    public string HttpGet(string Url, string postDataStr)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        request.Method = "GET";

        // A failed request is reported to the caller as the WebException raised here;
        // calling GetResponse a second time on the same request cannot recover from it.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream responseStream = response.GetResponseStream())
        using (MemoryStream buffer = new MemoryStream())
        {
            // Buffer the body so its first bytes can be inspected before decoding.
            responseStream.CopyTo(buffer);
            buffer.Position = 0;

            Stream decoded = buffer;
            if (StartsWithGzipHeader(buffer))
            {
                decoded = new System.IO.Compression.GZipStream(buffer, System.IO.Compression.CompressionMode.Decompress);
            }

            using (StreamReader reader = new StreamReader(decoded, Encoding.GetEncoding("GBK")))
            {
                return reader.ReadToEnd();
            }
        }
    }

    // Reports whether the stream begins with the two gzip magic bytes (0x1F 0x8B);
    // the stream position is restored to the start before returning.
    private static bool StartsWithGzipHeader(Stream stream)
    {
        int first = stream.ReadByte();
        int second = stream.ReadByte();
        stream.Position = 0;
        return first == 0x1F && second == 0x8B;
    }
}
}

Form1

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace WindowsFormsApp1
{
/// <summary>
/// Main window of the downloader: lets the user pick an output folder and downloads
/// every chapter of one novel into a single text file.
/// </summary>
public partial class Form1 : Form
{
    // Exclusive upper bound of the chapter ids that are requested (ids 1..1999).
    private const int ChapterIdLimit = 2000;

    // The patterns are built once instead of once per downloaded chapter.
    // [^"]+ stops at the closing quote of the value attribute.
    private static readonly Regex NovelNameRegex =
        new Regex(@"<input type=""hidden"" id=""novel_name"" value=""([^""]+)""/>");
    private static readonly Regex ChapterTitleRegex = new Regex(@"<h2>([\s\S]*?)</h2>");
    private static readonly Regex ChapterBodyRegex = new Regex(@"<br>[\s\S]*?</div></td>");
    private static readonly Regex LineBreakRegex = new Regex(@"<br\s*/?>", RegexOptions.IgnoreCase);
    private static readonly Regex HtmlTagRegex = new Regex(@"<[^>]*>");

    /// <summary>Output folder chosen through the folder dialog; empty until one is picked.</summary>
    public static string PathLog = "";

    public Form1()
    {
        InitializeComponent();
    }

    private void Form1_Load(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Opens a folder picker and remembers the chosen folder as the download target.
    /// </summary>
    private void button1_Click(object sender, EventArgs e)
    {
        using (FolderBrowserDialog dialog = new FolderBrowserDialog())
        {
            // Start in the current user's desktop folder instead of a path that only exists on one machine.
            dialog.SelectedPath = Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory);

            if (dialog.ShowDialog() == DialogResult.OK)
            {
                txtFilePathOpen.Text = dialog.SelectedPath;
                PathLog = dialog.SelectedPath;
            }
        }
    }

    /// <summary>
    /// Downloads chapters 1..1999 of the novel whose id is typed into the text box and
    /// appends each chapter that could be parsed to one text file in the chosen folder.
    /// </summary>
    /// <remarks>
    /// The download runs synchronously inside the click handler, so the window does not
    /// respond until the loop has finished.
    /// </remarks>
    private void button2_Click(object sender, EventArgs e)
    {
        string path = PathLog;
        string novelid = textName.Text.Trim();

        // Without a folder and an id the loop would only fail on every write or request.
        if (string.IsNullOrWhiteSpace(path) || string.IsNullOrWhiteSpace(novelid))
        {
            MessageBox.Show("请先输入小说标识并选择下载路径。");
            return;
        }

        TextHelper cra = new TextHelper();
        string baseUrl = "http://www.jjwxc.net/onebook.php?novelid=" + Uri.EscapeDataString(novelid) + "&chapterid=";

        // Decided once, from the first page that downloads, so every chapter lands in the same file.
        string fileName = null;

        for (int i = 1; i < ChapterIdLimit; i++)
        {
            string chapter;
            try
            {
                string html = cra.HttpGet(baseUrl + i, "");

                if (fileName == null)
                {
                    fileName = ResolveFileName(html, novelid);
                }

                chapter = BuildChapterText(html, i);
            }
            catch (Exception ex)
            {
                // Best effort: a chapter that cannot be fetched or decoded is skipped,
                // and the reason is left in the debug output instead of being discarded.
                System.Diagnostics.Debug.WriteLine("Chapter " + i + " skipped: " + ex.Message);
                continue;
            }

            // Only write chapters that were parsed in this iteration; writing after a
            // failure would append the previous chapter's text a second time.
            if (chapter != null)
            {
                cra.Novel(chapter, fileName, path);
            }
        }
    }

    private void textBox1_TextChanged(object sender, EventArgs e)
    {
    }

    // Builds the text block for one chapter, or returns null when the page has no <h2> chapter title.
    private static string BuildChapterText(string html, int chapterId)
    {
        Match titleMatch = ChapterTitleRegex.Match(html);
        if (!titleMatch.Success)
        {
            return null;
        }

        string chapterTitle = HtmlTagRegex.Replace(titleMatch.Groups[1].Value, "").Trim();

        // Value is "" when the page has no body section; the chapter then consists of its heading only.
        string body = ChapterBodyRegex.Match(html).Value;
        body = LineBreakRegex.Replace(body, "\r\n");
        // Remove the remaining tags. Splitting on the character 'd' (the previous approach)
        // cut the chapter at the first letter "d" that appeared anywhere in the text.
        body = HtmlTagRegex.Replace(body, "");

        return "第" + chapterId + "章 " + chapterTitle + "\r\n" + body + "\r\n";
    }

    // Picks the output file name: the novel title found in the page, or the fallback when
    // the page has none. Characters that are not allowed in file names become '_'.
    private static string ResolveFileName(string html, string fallback)
    {
        string candidate = NovelNameRegex.Match(html).Groups[1].Value.Trim();
        if (candidate.Length == 0)
        {
            candidate = fallback;
        }

        foreach (char invalid in Path.GetInvalidFileNameChars())
        {
            candidate = candidate.Replace(invalid, '_');
        }

        return candidate;
    }
}
}

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航