您的位置:首页 > 理论基础 > 计算机网络

利用爬虫-C#下载网络上的小说大部分网站可以用

2018-10-22 15:24 141 查看

本文纯属个人爱好不含任何商业用途:
基本思路:爬取网站内容的基本规则是利用标签来进行匹配,首先我们以一个小说网站为例http://www.jjwxc.net/onebook.php?novelid=3325239&chapterid=6这是某个小说的一个章节,大家可以看到novelid=3325239这个在该网站中对应的就是小说的唯一表示chapterid=6对应的是小说的章节号 那好根据这个我们可以访问打开任何小说包括他的任何章节。下载小说我们就需要拿到这个小说的名称查看网页源代码 这里我选择的是在这里截取小说的名称 源码

Regex ma_name = new Regex(@"<input type=""hidden"" id=""novel_name"" value=""(.+)""/>(.|\n)*?");
var mu_name = ma_name.Match(html);
text = mu_name.Groups[1].Value;

接下来获取章节名 我们可以发现章节名很明显在h2中 好那我们取出章节名 源码

Regex reg_zjm = new Regex(@"< h2 >(.|\n)*?</h2>");
var mat_zjm = reg_zjm.Match(html);
string zjm = mat_zjm.Groups[0].ToString();
zjm = zjm.Replace("< h2 >", " ");
zjm = zjm.Replace("</ h2>", " ");

最后在获取文章的内容就可以了同样的方法不细说了 直接上代码

Regex reg_mulu = new Regex(@"< br>(.|\n)*?</ div></td>");
var mat_mulu = reg_mulu.Match(html);
string nrong = mat_mulu.Groups[0].ToString();
// < br>< br/>替换为换行
nrong = nrong.Replace("< br>", "\r\n");
// 去掉其它的<>之间的东西
nrong = nrong.Split('d')[0];
nrong = nrong.Replace("<", "");
Content = "第" + i + "章" + zjm + "\r\n" + nrong + "\r\n";

最后输出文本

这里我是做成了一个winform输出的 可以浏览到本地的任何路径 选择路径的代码

//选择文件框 对象
FolderBrowserDialog ofd = new FolderBrowserDialog();
//打开时指定默认路径
ofd.SelectedPath = @"C:\Documents and Settings\Administrator.ICBCOA-6E96E6BE\桌面";
//如果用户点击确定
if (ofd.ShowDialog() == DialogResult.OK)
{
//将用户选择的文件路径 显示 在文本框中
txtFilePathOpen.Text = ofd.SelectedPath;
}
文本框中输入小说的标识 选中下载路径点击下载就可以了

完整代码
TextHelper.cs

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;

namespace WindowsFormsApp1
{
class TextHelper
{
/// <summary>
/// 创建文本
/// </summary>
/// <param name="content">内容</param>
/// <param name="name">名字</param>
/// <param name="path">路径</param>
public void Novel(string content, string name, string path)
{
string Log = content + "\r\n";
// 创建文件夹,如果不存在就创建file文件夹
if (Directory.Exists(path) == false)
{
Directory.CreateDirectory(path);
}

// 判断文件是否存在,不存在则创建
if (!System.IO.File.Exists(path + "\\"+ name + ".txt"))
{
FileStream fs1 = new FileStream(path +"\\"+ name + ".txt", FileMode.Create, FileAccess.Write);// 创建写入文件
StreamWriter sw = new StreamWriter(fs1);
sw.WriteLine(Log);// 开始写入值
sw.Close();
fs1.Close();
}
else
{
FileStream fs = new FileStream(path + "\\"+name + ".txt" + "", FileMode.Append, FileAccess.Write);
StreamWriter sr = new StreamWriter(fs);
sr.WriteLine(Log);// 开始写入值
sr.Close();
fs.Close();
}
}

public string HttpPost(string Url, string postDataStr)
{
CookieContainer cookie = new CookieContainer();
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
request.Method = "POST";
request.ContentType = "application/x-www-form-urlencoded";
request.ContentLength = Encoding.UTF8.GetByteCount(postDataStr);
request.CookieContainer = cookie;
Stream myRequestStream = request.GetRequestStream();
StreamWriter myStreamWriter = new StreamWriter(myRequestStream, Encoding.GetEncoding("gb2312"));
myStreamWriter.Write(postDataStr);
myStreamWriter.Close();

HttpWebResponse response = (HttpWebResponse)request.GetResponse();

response.Cookies = cookie.GetCookies(response.ResponseUri);
Stream myResponseStream = response.GetResponseStream();
StreamReader myStreamReader = new StreamReader(myResponseStream, Encoding.GetEncoding("utf-8"));
string retString = myStreamReader.ReadToEnd();
myStreamReader.Close();
myResponseStream.Close();

return retString;
}

public string HttpGet(string Url, string postDataStr)
{
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
request.Method = "GET";
//request.ContentType = "text/html;charset=GBK";
HttpWebResponse response;
try
{
response = (HttpWebResponse)request.GetResponse();
}
catch (WebException ex)
{
response = (HttpWebResponse)request.GetResponse();
}

Stream stm = new System.IO.Compression.GZipStream(response.GetResponseStream(), System.IO.Compression.CompressionMode.Decompress);
StreamReader myStreamReader = new StreamReader(stm, Encoding.GetEncoding("GBK"));
string retString = myStreamReader.ReadToEnd();
myStreamReader.Close();

return retString;
}
}
}

Form1

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace WindowsFormsApp1
{
public partial class Form1 : Form
{
public Form1()
{
InitializeComponent();
}

private void Form1_Load(object sender, EventArgs e)
{

}
public static string PathLog = "";
private void button1_Click(object sender, EventArgs e)
{
//选择文件框 对象
FolderBrowserDialog ofd = new FolderBrowserDialog();
//打开时指定默认路径
ofd.SelectedPath = @"C:\Documents and Settings\Administrator.ICBCOA-6E96E6BE\桌面";
//如果用户点击确定
if (ofd.ShowDialog() == DialogResult.OK)
{
//将用户选择的文件路径 显示 在文本框中
txtFilePathOpen.Text = ofd.SelectedPath;
PathLog = ofd.SelectedPath;
}
}

private void button2_Click(object sender, EventArgs e)
{
var path = PathLog;
//是否读取小说名称
var IsText = true;
//小说内容
var Content = "";
//小说标题
var text = "";
var novelid = textName.Text.ToString();
//抓取整本小说
//Thread thread = new Thread(() =>
//{
for (var i = 1; i < 2000; i++)
{

TextHelper cra = new TextHelper();
try
{
string html = cra.HttpGet("http://www.jjwxc.net/onebook.php?novelid=" + novelid + "&chapterid=" + i, "");

if (IsText)
{
//获取小说名字
Regex ma_name = new Regex(@"<input type=""hidden"" id=""novel_name"" value=""(.+)""/>(.|\n)*?");
var mu_name = ma_name.Match(html);
text = mu_name.Groups[1].Value;IsText = false;
}

//小说章节
Regex reg_zjm = new Regex(@"<h2>(.|\n)*?</h2>");
var mat_zjm = reg_zjm.Match(html);
string zjm = mat_zjm.Groups[0].ToString();
zjm = zjm.Replace("<h2>", " ");
zjm = zjm.Replace("</h2>", " ");
if (zjm == "")
{
continue;
}

// 小说内容
Regex reg_mulu = new Regex(@"<br>(.|\n)*?</div></td>");
var mat_mulu = reg_mulu.Match(html);
string nrong = mat_mulu.Groups[0].ToString();
// <br><br/>替换为换行
nrong = nrong.Replace("<br>", "\r\n");
// 去掉其它的<>之间的东西
nrong = nrong.Split('d')[0];
nrong = nrong.Replace("<", "");
Content = "第" + i + "章" + zjm + "\r\n" + nrong + "\r\n";
}
catch
{
}
// txt文本输出
cra.Novel(Content, text, path);
}

//});
//thread.IsBackground = true;
//thread.Start();
}

private void textBox1_TextChanged(object sender, EventArgs e)
{

}
}
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐