(终于找到了获取有效_signature的方法)博客搬家系列(六)-爬取今日头条文章(二)
(终于找到了获取有效_signature的方法)博客搬家系列(六)-爬取今日头条文章(二)
一.前情回顾
博客搬家系列(六)-爬取今日头条文章:https://blog.csdn.net/rico_zhou/article/details/83619564
上回我们说到了使用java htmlunit爬取今日头条的文章列表难度很大,关键在于_signature这个参数的加密算法,经过百度查询也发现了大家大多数都是使用Python selenium来获取,但是需要安装浏览器和浏览器驱动,这不是我们想要的,并且我们也测试出了以下几点,
1.使用网上找到的转化版js是可以获取到_signature的,但是只能在浏览器中打开此html获取的才能用,而使用htmlunit爬取本地html文件得到的却不能使用。
2.直接执行今日头条的TAC.sign()方法获取到的参数依然无法使用,由于每次获取的参数都是不一样的,也无法判断到底是缺少什么东西。
基于以上两点,我们开始今天的尝试(其实就是昨天的事儿,不甘心哪!)
二.整体分析
首先上一下此html:
[code]<!--
  Standalone test page that computes the request parameters "as", "cp" and "_signature".
  Inputs are read from the #user_id and #max_behot_time fields; results are written to the
  #as, #cp and #_signature fields.

  Script overview:
  * getHoney(user_id, max_behot_time) is called once when the script runs, with the two input values.
    Inside it:
    - e(e, a, r) compiles "return x <e> y" through window.Function, stores the result in b[e] and
      applies it to (r, a).
    - a(e, a, r) has its dynamic-constructor body commented out and returns a fixed date string instead;
      kk is assigned without "var", so it becomes a global.
    - r(e, a, r) builds a scope object s (depth in s.d, "$<depth>" entries, copied arguments) and runs c(e, 0, s).
      NOTE(review): "s[n = "$" + t] = r" stores the whole parent scope under every "$<t>" key; it looks like an
      index such as r[n] may have been lost when the listing was published. Confirm against the working file.
    - c(t, b, k) is an interpreter loop: each opcode is t.charCodeAt(b++) - 32 and v is its operand stack.
    - The decoded program string is run with [TAC = {}]; TAC.sign(user_id + "" + max_behot_time) then yields tt.
      rrr is assigned what appears to be the same decoded string and is not read again in this page.
    - "as" / "cp" are built from the current time in seconds (upper-case hex) interleaved with characters of the
      upper-cased md5 of that time. If the hex string is not 8 characters long, fixed "as" / "cp" values are
      returned and no field is written.
    - Otherwise as, cp and tt are written to the input fields, tt.substring(18,19) is logged, and
      { as, cp, _signature } is returned.
    - t1 / t2 are only referenced from commented-out code.
  * md5(string) UTF-8 encodes its argument and returns the lower-case hexadecimal MD5 digest.

  NOTE(review): this listing is collapsed onto a few long physical lines, so every "//" comment swallows the
  rest of its line. Presumably the original file had one statement per line; restore the line breaks before
  running it.
-->
<html> <head></head> <body> <input id="as"> <input id="cp"> <input id="_signature"> <input id="user_id" value="50080767248"> <input id="max_behot_time" value="0"> <textarea id="textarea1"></textarea> <script> var user_id=document.getElementById('user_id').value; var max_behot_time=document.getElementById('max_behot_time').value; getHoney(user_id,max_behot_time); function getHoney(user_id,max_behot_time){ function t1(x1,x2,x3){ return x1+x3+x2; } function t2(x1,x2,x3){ return x1+x3+x2; } function e(e, a, r) { //console.log(2222222+" "+e) //console.log(3333333+" "+a) //console.log(4444444+" "+r) var aa=((b[e] = t("x,y", "return x " + e + " y")))(r, a); return aa //return (b[e] || (b[e] = t1))(r, a) } function a(e, a, r) { //console.log(2222222+" "+e) //console.log(3333333+" "+a) //console.log(4444444+" "+r) //var kk=(k[r] || (k[r] = t("x,y", "return new x[y](" + Array(r + 1).join(",x[++y]").substr(1) + ")")))(e, a); //console.log(5555555+" "+kk) kk='Fri Nov 02 2017 10:41:59 GMT+0800 (中国标准时间)' return kk; } function r(e, a, r) { var n, t, s = {}, b = s.d = r ? 
r.d + 1 : 0; for (s["$" + b] = s, t = 0; t < b; t++) s[n = "$" + t] = r ; for (t = 0, b = s.length = a.length; t < b; t++) s[t] = a[t]; return c(e, 0, s) } function c(t, b, k) { function u(e) { v[x++] = e } function f() { return g = t.charCodeAt(b++) - 32, t.substring(b, b += g) } function l() { try { y = c(t, b, k) } catch (e) { h = e, y = l } } for (var h, y, d, g, v = [], x = 0;;) switch (g = t.charCodeAt(b++) - 32) { case 1: u(!v[--x]); break; case 4: v[x++] = f(); break; case 5: u(function (e) { var a = 0, r = e.length; return function () { var c = a < r; return c && u(e[a++]), c } }(v[--x])); break; case 6: y = v[--x], u(v[--x](y)); break; case 8: if (g = t.charCodeAt(b++) - 32, l(), b += g, g = t.charCodeAt(b++) - 32, y === c) b += g; else if (y !== l) return y; break; case 9: v[x++] = c; break; case 10: u(s(v[--x])); break; case 11: y = v[--x], u(v[--x] + y); break; case 12: for (y = f(), d = [], g = 0; g < y.length; g++) d[g] = y.charCodeAt(g) ^ g + y.length; u(String.fromCharCode.apply(null, d)); break; case 13: y = v[--x], h = delete v[--x][y]; break; case 14: v[x++] = t.charCodeAt(b++) - 32; break; case 59: u((g = t.charCodeAt(b++) - 32) ? (y = x, v.slice(x -= g, y)) : []); break; case 61: u(v[--x][t.charCodeAt(b++) - 32]); break; case 62: g = v[--x], k[0] = 65599 * k[0] + k[1].charCodeAt(g) >>> 0; break; case 65: h = v[--x], y = v[--x], v[--x][y] = h; break; case 66: u(e(t.substr(b++, 1), v[--x], v[--x])); break; case 67: y = v[--x]; d = v[--x]; g = v[--x]; u(g.x === c ? r(g.y, y, k) : g.apply(d, y)); break; case 68: u(e((g = t.substr(b++, 1)) < "<" ? 
(b--, f()) : g + g, v[--x], v[--x])); break; case 70: u(!1); break; case 71: v[x++] = n; break; case 72: v[x++] = +f(); break; case 73: u(parseInt(f(), 36)); break; case 75: if (v[--x]) { b++; break } case 74: g = t.charCodeAt(b++) - 32 << 16 >> 16, b += g; break; case 76: u(k[t.charCodeAt(b++) - 32]); break; case 77: y = v[--x], u(v[--x][y]); break; case 78: g = t.charCodeAt(b++) - 32, u(a(v, x -= g + 1, g)); break; case 79: g = t.charCodeAt(b++) - 32, u(k["$" + g]); break; case 81: h = v[--x], v[--x][f()] = h; break; case 82: u(v[--x][f()]); break; case 83: h = v[--x], k[t.charCodeAt(b++) - 32] = h; break; case 84: v[x++] = !0; break; case 85: v[x++] = void 0; break; case 86: u(v[x - 1]); break; case 88: h = v[--x], y = v[--x], v[x++] = h, v[x++] = y; break; case 89: u(function () { function e() { return r(e.y, arguments, k) } return e.y = f(), e.x = c, e }()); break; case 90: v[x++] = null; break; case 91: v[x++] = h; break; case 93: h = v[--x]; break; case 0: return v[--x]; default: u((g << 16 >> 16) - 16) } } var n = window; //document.getElementById('textarea1').value=JSON.stringify(window); var t = n.Function, s = Object.keys || function (e) { var a = {}, r = 0; for (var c in e) a[r++] = c; return a.length = r, a }, b = {}, k = {}; var rrr= 
decodeURIComponent("gr%24Daten%20%D0%98b%2Fs!l%20y%CD%92y%C4%B9g%2C(lfi~ah%60%7Bmv%2C-n%7CjqewVxp%7Brvmmx%2C%26eff%7Fkx%5B!cs%22l%22.Pq%25widthl%22%40q%26heightl%22vr*getContextx%24%222d%5B!cs%23l%23%2C*%3B%3F%7Cu.%7Cuc%7Buq%24fontl%23vr(fillTextx%24%24%E9%BE%98%E0%B8%91%E0%B8%A0%EA%B2%BD2%3C%5B%23c%7Dl%232q*shadowBlurl%231q-shadowOffsetXl%23%24%24limeq%2BshadowColorl%23vr%23arcx88802%5B%25c%7Dl%23vr%26strokex%5B%20c%7Dl%22v%2C)%7DeOmyoZB%5Dmx%5B%20cs!0s%24l%24Pb%3Ck7l%20l!r%26lengthb%25%5El%241%2Bs%24j%02l%20%20s%23i%241ek1s%24gr%23tack4)zgr%23tac%24!%20%2B0o!%5B%23cj%3Fo%20%5D!l%24b%25s%22o%20%5D!l%22l%24b*b%5E0d%23%3E%3E%3Es!0s%25yA0s%22l%22l!r%26lengthb%3Ck%2Bl%22%5El%221%2Bs%22j%05l%20%20s%26l%26z0l!%24%20%2B%5B%22cs'(0l%23i'1ps9wxb%26s()%20%26%7Bs)%2Fs(gr%26Stringr%2CfromCharCodes)0s*yWl%20._b%26s%20o!%5D)l%20l%20Jb%3Ck%24.aj%3Bl%20.Tb%3Ck%24.gj%2Fl%20.%5Eb%3Ck%26i%22-4j!%1F%2B%26%20s%2ByPo!%5D%2Bs!l!l%20Hd%3E%26l!l%20Bd%3E%26%2Bl!l%20%3Cd%3E%26%2Bl!l%206d%3E%26%2Bl!l%20%26%2B%20s%2Cy%3Do!o!%5D%2Fq%2213o!l%20q%2210o!%5D%2Cl%202d%3E%26%20s.%7Bs-yMo!o!%5D0q%2213o!%5D*Ld%3Cl%204d%23%3E%3E%3Eb%7Cs!o!l%20q%2210o!%5D%2Cl!%26%20s%2FyIo!o!%5D.q%2213o!%5D%2Co!%5D*Jd%3Cl%206d%23%3E%3E%3Eb%7C%26o!%5D%2Bl%20%26%2B%20s0l-l!%26l-l!i'1z141z4b%2F%40d%3Cl%22b%7C%26%2Bl-l(l!b%5E%26%2Bl-l%26zl'g%2C)gk%7Dejo%7B%7Fcm%2C)%7Cyn~Lij~em%5B%22cl%24b%25%40d%3Cl%26zl'l%20%24%20%2B%5B%22cl%24b%25b%7C%26%2Bl-l%258d%3C%40b%7Cl!b%5E%26%2B%20q%24sign%20"); 
r(decodeURIComponent("gr%24Daten%20%D0%98b%2Fs!l%20y%CD%92y%C4%B9g%2C(lfi~ah%60%7Bmv%2C-n%7CjqewVxp%7Brvmmx%2C%26eff%7Fkx%5B!cs%22l%22.Pq%25widthl%22%40q%26heightl%22vr*getContextx%24%222d%5B!cs%23l%23%2C*%3B%3F%7Cu.%7Cuc%7Buq%24fontl%23vr(fillTextx%24%24%E9%BE%98%E0%B8%91%E0%B8%A0%EA%B2%BD2%3C%5B%23c%7Dl%232q*shadowBlurl%231q-shadowOffsetXl%23%24%24limeq%2BshadowColorl%23vr%23arcx88802%5B%25c%7Dl%23vr%26strokex%5B%20c%7Dl%22v%2C)%7DeOmyoZB%5Dmx%5B%20cs!0s%24l%24Pb%3Ck7l%20l!r%26lengthb%25%5El%241%2Bs%24j%02l%20%20s%23i%241ek1s%24gr%23tack4)zgr%23tac%24!%20%2B0o!%5B%23cj%3Fo%20%5D!l%24b%25s%22o%20%5D!l%22l%24b*b%5E0d%23%3E%3E%3Es!0s%25yA0s%22l%22l!r%26lengthb%3Ck%2Bl%22%5El%221%2Bs%22j%05l%20%20s%26l%26z0l!%24%20%2B%5B%22cs'(0l%23i'1ps9wxb%26s()%20%26%7Bs)%2Fs(gr%26Stringr%2CfromCharCodes)0s*yWl%20._b%26s%20o!%5D)l%20l%20Jb%3Ck%24.aj%3Bl%20.Tb%3Ck%24.gj%2Fl%20.%5Eb%3Ck%26i%22-4j!%1F%2B%26%20s%2ByPo!%5D%2Bs!l!l%20Hd%3E%26l!l%20Bd%3E%26%2Bl!l%20%3Cd%3E%26%2Bl!l%206d%3E%26%2Bl!l%20%26%2B%20s%2Cy%3Do!o!%5D%2Fq%2213o!l%20q%2210o!%5D%2Cl%202d%3E%26%20s.%7Bs-yMo!o!%5D0q%2213o!%5D*Ld%3Cl%204d%23%3E%3E%3Eb%7Cs!o!l%20q%2210o!%5D%2Cl!%26%20s%2FyIo!o!%5D.q%2213o!%5D%2Co!%5D*Jd%3Cl%206d%23%3E%3E%3Eb%7C%26o!%5D%2Bl%20%26%2B%20s0l-l!%26l-l!i'1z141z4b%2F%40d%3Cl%22b%7C%26%2Bl-l(l!b%5E%26%2Bl-l%26zl'g%2C)gk%7Dejo%7B%7Fcm%2C)%7Cyn~Lij~em%5B%22cl%24b%25%40d%3Cl%26zl'l%20%24%20%2B%5B%22cl%24b%25b%7C%26%2Bl-l%258d%3C%40b%7Cl!b%5E%26%2B%20q%24sign%20"), [TAC = {}]); tt = TAC.sign(user_id+"" + max_behot_time); var i = Math.floor((new Date).getTime() / 1e3) , e = i.toString(16).toUpperCase() , t = md5(i).toString().toUpperCase(); if (8 != e.length) return { as: "479BB4B7254C150", cp: "7E0AC8874BB0985" }; for (var n = t.slice(0, 5), o = t.slice(-5), s = "", a = 0; 5 > a; a++) { s += n[a] + e[a];} for (var l = "", r = 0; 5 > r; r++){ l += e[r + 3] + o[r]; } var as="A1" + s + e.slice(-3); var cp=e.slice(0, 3) + l + "E1"; document.getElementById('as').value=as; 
document.getElementById('cp').value=cp; document.getElementById('_signature').value=tt; console.log(tt.substring(18,19)) return { as: as, cp: cp, _signature:tt } } function md5(string) { function md5_RotateLeft(lValue, iShiftBits) { return (lValue << iShiftBits) | (lValue >>> (32 - iShiftBits)); } function md5_AddUnsigned(lX, lY) { var lX4, lY4, lX8, lY8, lResult; lX8 = (lX & 0x80000000); lY8 = (lY & 0x80000000); lX4 = (lX & 0x40000000); lY4 = (lY & 0x40000000); lResult = (lX & 0x3FFFFFFF) + (lY & 0x3FFFFFFF); if (lX4 & lY4) { return (lResult ^ 0x80000000 ^ lX8 ^ lY8); } if (lX4 | lY4) { if (lResult & 0x40000000) { return (lResult ^ 0xC0000000 ^ lX8 ^ lY8); } else { return (lResult ^ 0x40000000 ^ lX8 ^ lY8); } } else { return (lResult ^ lX8 ^ lY8); } } function md5_F(x, y, z) { return (x & y) | ((~x) & z); } function md5_G(x, y, z) { return (x & z) | (y & (~z)); } function md5_H(x, y, z) { return (x ^ y ^ z); } function md5_I(x, y, z) { return (y ^ (x | (~z))); } function md5_FF(a, b, c, d, x, s, ac) { a = md5_AddUnsigned(a, md5_AddUnsigned(md5_AddUnsigned(md5_F(b, c, d), x), ac)); return md5_AddUnsigned(md5_RotateLeft(a, s), b); }; function md5_GG(a, b, c, d, x, s, ac) { a = md5_AddUnsigned(a, md5_AddUnsigned(md5_AddUnsigned(md5_G(b, c, d), x), ac)); return md5_AddUnsigned(md5_RotateLeft(a, s), b); }; function md5_HH(a, b, c, d, x, s, ac) { a = md5_AddUnsigned(a, md5_AddUnsigned(md5_AddUnsigned(md5_H(b, c, d), x), ac)); return md5_AddUnsigned(md5_RotateLeft(a, s), b); }; function md5_II(a, b, c, d, x, s, ac) { a = md5_AddUnsigned(a, md5_AddUnsigned(md5_AddUnsigned(md5_I(b, c, d), x), ac)); return md5_AddUnsigned(md5_RotateLeft(a, s), b); }; function md5_ConvertToWordArray(string) { var lWordCount; var lMessageLength = string.length; var lNumberOfWords_temp1 = lMessageLength + 8; var lNumberOfWords_temp2 = (lNumberOfWords_temp1 - (lNumberOfWords_temp1 % 64)) / 64; var lNumberOfWords = (lNumberOfWords_temp2 + 1) * 16; var lWordArray = Array(lNumberOfWords - 1); var 
lBytePosition = 0; var lByteCount = 0; while (lByteCount < lMessageLength) { lWordCount = (lByteCount - (lByteCount % 4)) / 4; lBytePosition = (lByteCount % 4) * 8; lWordArray[lWordCount] = (lWordArray[lWordCount] | (string.charCodeAt(lByteCount) << lBytePosition)); lByteCount++; } lWordCount = (lByteCount - (lByteCount % 4)) / 4; lBytePosition = (lByteCount % 4) * 8; lWordArray[lWordCount] = lWordArray[lWordCount] | (0x80 << lBytePosition); lWordArray[lNumberOfWords - 2] = lMessageLength << 3; lWordArray[lNumberOfWords - 1] = lMessageLength >>> 29; return lWordArray; }; function md5_WordToHex(lValue) { var WordToHexValue = "", WordToHexValue_temp = "", lByte, lCount; for (lCount = 0; lCount <= 3; lCount++) { lByte = (lValue >>> (lCount * 8)) & 255; WordToHexValue_temp = "0" + lByte.toString(16); WordToHexValue = WordToHexValue + WordToHexValue_temp.substr(WordToHexValue_temp.length - 2, 2); } return WordToHexValue; }; function md5_Utf8Encode(string) { string = string.toString().replace(/\r\n/g, "\n"); var utftext = ""; for (var n = 0; n < string.length; n++) { var c = string.charCodeAt(n); if (c < 128) { utftext += String.fromCharCode(c); } else if ((c > 127) && (c < 2048)) { utftext += String.fromCharCode((c >> 6) | 192); utftext += String.fromCharCode((c & 63) | 128); } else { utftext += String.fromCharCode((c >> 12) | 224); utftext += String.fromCharCode(((c >> 6) & 63) | 128); utftext += String.fromCharCode((c & 63) | 128); } } return utftext; }; var x = Array(); var k, AA, BB, CC, DD, a, b, c, d; var S11 = 7, S12 = 12, S13 = 17, S14 = 22; var S21 = 5, S22 = 9, S23 = 14, S24 = 20; var S31 = 4, S32 = 11, S33 = 16, S34 = 23; var S41 = 6, S42 = 10, S43 = 15, S44 = 21; string = md5_Utf8Encode(string); x = md5_ConvertToWordArray(string); a = 0x67452301; b = 0xEFCDAB89; c = 0x98BADCFE; d = 0x10325476; for (k = 0; k < x.length; k += 16) { AA = a; BB = b; CC = c; DD = d; a = md5_FF(a, b, c, d, x[k + 0], S11, 0xD76AA478); d = md5_FF(d, a, b, c, x[k + 1], S12, 
0xE8C7B756); c = md5_FF(c, d, a, b, x[k + 2], S13, 0x242070DB); b = md5_FF(b, c, d, a, x[k + 3], S14, 0xC1BDCEEE); a = md5_FF(a, b, c, d, x[k + 4], S11, 0xF57C0FAF); d = md5_FF(d, a, b, c, x[k + 5], S12, 0x4787C62A); c = md5_FF(c, d, a, b, x[k + 6], S13, 0xA8304613); b = md5_FF(b, c, d, a, x[k + 7], S14, 0xFD469501); a = md5_FF(a, b, c, d, x[k + 8], S11, 0x698098D8); d = md5_FF(d, a, b, c, x[k + 9], S12, 0x8B44F7AF); c = md5_FF(c, d, a, b, x[k + 10], S13, 0xFFFF5BB1); b = md5_FF(b, c, d, a, x[k + 11], S14, 0x895CD7BE); a = md5_FF(a, b, c, d, x[k + 12], S11, 0x6B901122); d = md5_FF(d, a, b, c, x[k + 13], S12, 0xFD987193); c = md5_FF(c, d, a, b, x[k + 14], S13, 0xA679438E); b = md5_FF(b, c, d, a, x[k + 15], S14, 0x49B40821); a = md5_GG(a, b, c, d, x[k + 1], S21, 0xF61E2562); d = md5_GG(d, a, b, c, x[k + 6], S22, 0xC040B340); c = md5_GG(c, d, a, b, x[k + 11], S23, 0x265E5A51); b = md5_GG(b, c, d, a, x[k + 0], S24, 0xE9B6C7AA); a = md5_GG(a, b, c, d, x[k + 5], S21, 0xD62F105D); d = md5_GG(d, a, b, c, x[k + 10], S22, 0x2441453); c = md5_GG(c, d, a, b, x[k + 15], S23, 0xD8A1E681); b = md5_GG(b, c, d, a, x[k + 4], S24, 0xE7D3FBC8); a = md5_GG(a, b, c, d, x[k + 9], S21, 0x21E1CDE6); d = md5_GG(d, a, b, c, x[k + 14], S22, 0xC33707D6); c = md5_GG(c, d, a, b, x[k + 3], S23, 0xF4D50D87); b = md5_GG(b, c, d, a, x[k + 8], S24, 0x455A14ED); a = md5_GG(a, b, c, d, x[k + 13], S21, 0xA9E3E905); d = md5_GG(d, a, b, c, x[k + 2], S22, 0xFCEFA3F8); c = md5_GG(c, d, a, b, x[k + 7], S23, 0x676F02D9); b = md5_GG(b, c, d, a, x[k + 12], S24, 0x8D2A4C8A); a = md5_HH(a, b, c, d, x[k + 5], S31, 0xFFFA3942); d = md5_HH(d, a, b, c, x[k + 8], S32, 0x8771F681); c = md5_HH(c, d, a, b, x[k + 11], S33, 0x6D9D6122); b = md5_HH(b, c, d, a, x[k + 14], S34, 0xFDE5380C); a = md5_HH(a, b, c, d, x[k + 1], S31, 0xA4BEEA44); d = md5_HH(d, a, b, c, x[k + 4], S32, 0x4BDECFA9); c = md5_HH(c, d, a, b, x[k + 7], S33, 0xF6BB4B60); b = md5_HH(b, c, d, a, x[k + 10], S34, 0xBEBFBC70); a = md5_HH(a, b, c, d, x[k + 13], 
S31, 0x289B7EC6); d = md5_HH(d, a, b, c, x[k + 0], S32, 0xEAA127FA); c = md5_HH(c, d, a, b, x[k + 3], S33, 0xD4EF3085); b = md5_HH(b, c, d, a, x[k + 6], S34, 0x4881D05); a = md5_HH(a, b, c, d, x[k + 9], S31, 0xD9D4D039); d = md5_HH(d, a, b, c, x[k + 12], S32, 0xE6DB99E5); c = md5_HH(c, d, a, b, x[k + 15], S33, 0x1FA27CF8); b = md5_HH(b, c, d, a, x[k + 2], S34, 0xC4AC5665); a = md5_II(a, b, c, d, x[k + 0], S41, 0xF4292244); d = md5_II(d, a, b, c, x[k + 7], S42, 0x432AFF97); c = md5_II(c, d, a, b, x[k + 14], S43, 0xAB9423A7); b = md5_II(b, c, d, a, x[k + 5], S44, 0xFC93A039); a = md5_II(a, b, c, d, x[k + 12], S41, 0x655B59C3); d = md5_II(d, a, b, c, x[k + 3], S42, 0x8F0CCC92); c = md5_II(c, d, a, b, x[k + 10], S43, 0xFFEFF47D); b = md5_II(b, c, d, a, x[k + 1], S44, 0x85845DD1); a = md5_II(a, b, c, d, x[k + 8], S41, 0x6FA87E4F); d = md5_II(d, a, b, c, x[k + 15], S42, 0xFE2CE6E0); c = md5_II(c, d, a, b, x[k + 6], S43, 0xA3014314); b = md5_II(b, c, d, a, x[k + 13], S44, 0x4E0811A1); a = md5_II(a, b, c, d, x[k + 4], S41, 0xF7537E82); d = md5_II(d, a, b, c, x[k + 11], S42, 0xBD3AF235); c = md5_II(c, d, a, b, x[k + 2], S43, 0x2AD7D2BB); b = md5_II(b, c, d, a, x[k + 9], S44, 0xEB86D391); a = md5_AddUnsigned(a, AA); b = md5_AddUnsigned(b, BB); c = md5_AddUnsigned(c, CC); d = md5_AddUnsigned(d, DD); } return (md5_WordToHex(a) + md5_WordToHex(b) + md5_WordToHex(c) + md5_WordToHex(d)).toLowerCase(); } </script> </body> </html>
基于上面分析的第二点,我们暂时可以先看看这个算法到底是怎样的(核心代码来源于网上,暂时忘了url,找到后再补上)。本人js水平有限,大概也就注意到了以下几点:
1.直接将js部分用nodejs执行肯定是不行的,因为代码中出现了window对象,且使用了window.Function构建动态函数,没办法,不会将其构造成纯js运行,那么我们的目标就是将直接打开html获取的参数param1,和使用htmlunit读取本地html文件获取的参数param2比对,找出其中的规律,让其一样或者都生效(param1是生效的)。
2.根据代码中构建函数t=window.Function,我们找到了两个相关的使用地方:
[code]// Applies the binary operator named by `e` to the operands r (left) and a (right).
// The operator function is compiled through `t` on every call and stored in b[e].
function e(e, a, r) {
  var source = "return x " + e + " y";
  var operator = t("x,y", source);
  b[e] = operator;
  return operator(r, a);
}

// Calls `new e[a](e[a + 1], ...)` with r constructor arguments taken from the
// array e, starting right after index a. The compiled invoker is cached in k[r].
function a(e, a, r) {
  var invoke = k[r];
  if (!invoke) {
    var argumentList = Array(r + 1).join(",x[++y]").substr(1);
    invoke = t("x,y", "return new x[y](" + argumentList + ")");
    k[r] = invoke;
  }
  return invoke(e, a);
}
说实话,只知道是构建函数但是具体干嘛的还是不知道,咋办呢?那就console.log()输出一下呗,改一下:
[code]// Debug variant: logs the three arguments, then applies the binary operator
// named by `e` to the operands r (left) and a (right). The operator function is
// compiled through `t` on every call and stored in b[e].
function e(e, a, r) {
  console.log(2222222+" "+e);
  console.log(3333333+" "+a);
  console.log(4444444+" "+r);
  var source = "return x " + e + " y";
  var operator = t("x,y", source);
  b[e] = operator;
  var aa = operator(r, a);
  return aa;
}

// Debug variant: logs the three arguments, calls `new e[a](e[a + 1], ...)` with
// r constructor arguments (invoker cached in k[r]), logs the created value and
// returns it.
function a(e, a, r) {
  console.log(2222222+" "+e);
  console.log(3333333+" "+a);
  console.log(4444444+" "+r);
  var invoke = k[r];
  if (!invoke) {
    var argumentList = Array(r + 1).join(",x[++y]").substr(1);
    invoke = t("x,y", "return new x[y](" + argumentList + ")");
    k[r] = invoke;
  }
  var kk = invoke(e, a);
  console.log(5555555+" "+kk);
  return kk;
}
经过多次打开运行验证,发现函数e运行了很多次,而且也看不出啥规律,但是函数a却只运行了一次
注意是55555开头的,显然这是时间戳,而且只出现一次,那么我就先将其写死,毕竟这个是可以自行获取不再需要加密了,
[code] // Debug variant with a pinned result: still logs the arguments and performs the
// dynamic `new e[a](...)` call (invoker cached in k[r]) so the created value can
// be logged, but always returns a fixed date string instead of that value.
function a(e, a, r) {
  console.log(2222222+" "+e);
  console.log(3333333+" "+a);
  console.log(4444444+" "+r);
  var invoke = k[r];
  if (!invoke) {
    var argumentList = Array(r + 1).join(",x[++y]").substr(1);
    invoke = t("x,y", "return new x[y](" + argumentList + ")");
    k[r] = invoke;
  }
  var kk = invoke(e, a);
  console.log(5555555+" "+kk);
  // Hard-code the value for now so repeated runs produce the same result.
  kk = 'Fri Nov 02 2017 10:41:59 GMT+0800 (中国标准时间)';
  return kk;
}
再次运行
结果:AAAAAAAAAABXN0hbnCxxawAAAB
多次运行,发现参数没有变化,而且参数变得简洁了,改变一下时间值发现还是没有变化,说明此参数的变化是只跟user_id,max_behot_time有关系,若固定则参数值固定,至于为什么没改之前却一直变化,大概是k这个函数变化吧,不管了,那么先看看能不能用,将对应的user_id和max_behot_time带入,发现确实可以使用,此时欣喜若狂,既然页面的参数param1固定了而且也可以使用了,那就和使用htmlunit获取的参数param2好比对了,
赶紧使用htmlunit读取一下本地刚刚的html
[code]/**
 * Loads the local signature page with HtmlUnit and prints the page text,
 * so the values computed by the page script can be inspected.
 *
 * @throws Exception if the page cannot be loaded
 */
public static void test3() throws Exception {
    String urlOne = "file:///C:/Users/rzhou6/Desktop/toutiao/newd.html";
    // Simulate a browser. try-with-resources closes the client (and its
    // windows / background JavaScript threads) even when loading fails.
    try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
        // Do not abort when the page script throws
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        // Disable CSS processing
        webClient.getOptions().setCssEnabled(false);
        // Do not abort on failing HTTP status codes (e.g. a referenced js file is missing)
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        HtmlPage page2 = webClient.getPage(urlOne);
        System.out.println(page2.asText());
    }
}
结果:AAAAAAAAAAAYyN2oFg9xawAAAB
简单比对一下
param1:AAAAAAAAAABXN0hbnCxxawAAAB(有效)
param2:AAAAAAAAAAAYyN2oFg9xawAAAB(无效)
中间的几位不同,为了能找到规律,那就得多测试,选择不同的user_id,每个id不同的页面请求,并将其参数分割,数据如下:
[code]//同一id 101528687217 2,3,4页,网页可行,htmlunit本地文件不可行 1539419885 >AAAAAAAAAA BXN0hbnC x FHA AAAB >AAAAAAAAAA AYyN2oFg 9 FHA AAAB 1537250643 >AAAAAAAAAA BXN0hbnC x Mew AAAB >AAAAAAAAAA AYyN2oFg 9 Mew AAAB 1536065316 >AAAAAAAAAA BXN0hbnC w -ig AAAB >AAAAAAAAAA AYyN2oFg 8 -ig AAAB usrid 50080767248 1540553612 >AAAAAAAAAA BXN0hbnC x 2Gg AAAB >AAAAAAAAAA AYyN2oFg 9 2Gg AAAB 1540133733 >AAAAAAAAAA BXN0hbnC w ipA AAAB >AAAAAAAAAA AYyN2oFg 8 ipA AAAB 1539776774 >AAAAAAAAAA BXN0hbnC x RsQ AAAB >AAAAAAAAAA AYyN2oFg 9 RsQ AAAB 1539406769 >AAAAAAAAAA BXN0hbnC z Swg AAAB >AAAAAAAAAA AYyN2oFg . Swg AAAB 1538986022 >AAAAAAAAAA BXN0hbnC z qDw AAAB >AAAAAAAAAA AYyN2oFg . qDw AAAB 1538388819 >AAAAAAAAAA BXN0hbnC y jAg AAAB >AAAAAAAAAA AYyN2oFg - jAg AAAB
上一行为页面浏览器获取的参数param1,下一行是使用htmlunit读取html获取的参数param2。多方比对发现,不管是不是同一个id或者第几页,所有有效的param1的中间部分BXN0hbnC,在参数param2中都变成了AYyN2oFg,好办,那我们只要反向替换一下即可。除此之外就只有第19位字符不一样,其后的字符都是一样的,猜测加密算法中对max_behot_time的值进行加密然后获取新值。暂时我们可以发现x对应9,w对应8,z对应.,y对应-,其他的暂时未知。只要我们知道了第19位字符的对应规律就能反向替换,最终通过htmlunit读取本地html就能获取到有效的参数_signature。现在更改一下html文件,循环一下max_behot_time,看看参数第19位都有哪些值出现:
[code]// Call getHoney for 1000 consecutive max_behot_time values; each call logs the
// 19th character of the signature it computes.
var i = 1000000000;
while (i < 1000001000) {
  getHoney(user_id, i);
  i++;
}
注意i值不要过大,循环次数不要过多,不然容易卡死,
运行
多次改变i初始值发现,只有y,z,w,x这四个值出现,那么我们只需要替换它们即可。
接下来写代码
[code]/**
 * Builds the request URL for one page of a user's Toutiao article list.
 * <p>
 * The user id and {@code max_behot_time} are written into the local signature
 * page, the page is executed with HtmlUnit, and the resulting {@code as},
 * {@code cp} and {@code _signature} values are read back from its input fields.
 * The signature is then passed through {@link #getRightSign(String)} before it
 * is placed in the URL.
 *
 * @param blogMove       supplies the user id via {@code getMoveUserId()}
 * @param num            not used by the current implementation
 * @param max_behot_time the max_behot_time value of the page to request
 * @return the formatted article-list URL
 * @throws Exception if the local page cannot be updated, loaded or evaluated
 */
public static String getTouTiaoListUrl(Blogmove blogMove, int num, String max_behot_time) throws Exception {
    String oneUrl = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=%s&max_behot_time=%s&count=20&as=%s&cp=%s&_signature=%s";
    String user_id = blogMove.getMoveUserId();
    String as;
    String cp;
    String _signature;
    // Write the current user id / max_behot_time into the local page
    updateHtmlFile("C:/Users/rzhou6/Desktop/toutiao/newd.html", user_id, max_behot_time);
    String urlOne = "file:///C:/Users/rzhou6/Desktop/toutiao/newd.html";
    // Simulate a browser. try-with-resources closes the client (and its
    // windows / background JavaScript threads) even when loading fails.
    try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
        // Do not abort when the page script throws
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        // Disable CSS processing
        webClient.getOptions().setCssEnabled(false);
        // Do not abort on failing HTTP status codes (e.g. a referenced js file is missing)
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        HtmlPage page2 = webClient.getPage(urlOne);
        System.out.println(page2.asText());
        // Read the values the page script stored in the input fields
        as = page2.getElementById("as").asText();
        cp = page2.getElementById("cp").asText();
        _signature = page2.getElementById("_signature").asText();
    }
    System.out.println(_signature);
    _signature = getRightSign(_signature);
    System.out.println(as);
    System.out.println(cp);
    System.out.println(_signature);
    oneUrl = String.format(oneUrl, user_id, max_behot_time, as, cp, _signature);
    System.out.println(oneUrl);
    return oneUrl;
}

/**
 * Rewrites the local signature page so that its {@code user_id} and
 * {@code max_behot_time} input fields carry the given values.
 *
 * @param string         path of the HTML file to rewrite
 * @param user_id        value for the {@code user_id} input
 * @param max_behot_time value for the {@code max_behot_time} input
 * @throws IllegalStateException if an expected input element is missing or
 *                               the old file cannot be removed
 */
private static void updateHtmlFile(String string, String user_id, String max_behot_time) {
    String html = FileUtils.getFileToString(string);
    Document doc = Jsoup.parse(html);
    setInputValue(doc, "user_id", user_id);
    setInputValue(doc, "max_behot_time", max_behot_time);
    // The new content is written with an append call, so the old file must
    // really be gone first; otherwise the page would end up duplicated.
    File target = new File(string);
    if (target.exists() && !target.delete()) {
        throw new IllegalStateException("Could not delete " + string + " before rewriting it");
    }
    FileUtils.appendFile(string, doc.html());
}

/** Sets the {@code value} attribute of the element with the given id, failing clearly if it is absent. */
private static void setInputValue(Document doc, String id, String value) {
    Element input = doc.getElementById(id);
    if (input == null) {
        throw new IllegalStateException("Element #" + id + " not found in signature page");
    }
    input.attr("value", value);
}

/**
 * Converts a signature produced under HtmlUnit into the form produced by a
 * real browser, using the mapping recorded for the character at index 18:
 * 8 -> w, 9 -> x, - -> y, . -> z. Any other character is kept as is. The
 * prefix and suffix are replaced by the fixed browser-side values.
 * <pre>
 * browser : AAAAAAAAAA BXN0hbnC y jAQ AAAB
 * htmlunit: AAAAAAAAAA AYyN2oFg - jAQ AAAB
 * </pre>
 *
 * @param _signature the signature read from the page
 * @return the corrected signature
 * @throws IllegalArgumentException if the signature is null or shorter than
 *                                  22 characters (e.g. the page script failed
 *                                  and left the field empty)
 */
private static String getRightSign(String _signature) {
    if (_signature == null || _signature.length() < 22) {
        throw new IllegalArgumentException(
                "Unexpected _signature, need at least 22 characters: " + _signature);
    }
    String s = _signature.substring(18, 19);
    String ss = _signature.substring(19, 22);
    switch (s) {
    case "8":
        s = "w";
        break;
    case "9":
        s = "x";
        break;
    case "-":
        s = "y";
        break;
    case ".":
        s = "z";
        break;
    default:
        // No mapping recorded for this character; keep it unchanged
        break;
    }
    return "AAAAAAAAAABXN0hbnC" + s + ss + "AAAB";
}
经验证发现,所得的url均可用
PS:其中最后一步出现了不小的波折,在使用htmlunit时突然获取的参数跟测试时不一样了,规律也不一样,但是代码是完全一样的啊。经过对比,发现是jar包版本的问题,真是奇怪,也没有任何冲突,总之获取的就是不一样,大概这也是起初两个参数获取不一样的原因吧,毕竟htmlunit是模拟而不是实实在在的浏览器。htmlunit使用2.27版本即可,使用2.32版本获取的参数规律不再是上文所说的了。
PPS:最后又发现了问题,虽然获取的url在浏览器中是完全可以获取到json数据的,但是htmlunit发送此get请求时,居然偶尔可行,偶尔不行,估计是今日头条的反爬又有限制了。不过没关系,获取了正确的url还怕取不到数据?
欢迎交流学习!
完整源码请见github:https://github.com/ricozhou/blogmove
阅读更多- 终于找到如何获取积分的方法了
- 治疗鼻炎非常有效的方法——终于找到了
- 治疗鼻炎非常有效的方法——终于找到了
- GOD!我终于找到博客园设置博客文章图片签名的方法了!
- 如何获取外网Ip呢, 终于找到方法了
- 终于找到ASP.NET c# 简单的获取远端文件的方法!
- 获取有效外链的最好方法
- mysql 查询获取排名的方法(绝对有效)
- dede织梦栏目页和文章页中获取当前栏目名称方法
- J2EE系列之Struts2学习笔记(二)---使用get/set方法自动获取/设置参数值
- WordPress中用于获取文章作者与分类信息的方法整理
- php 获取今日、昨日、上周、本月的起始时间戳和结束时间戳的方法
- 推荐之三(批处理):xp一键更改IP与MAC地址(终于找到更快的方法)
- php获取文章上一页与下一页的方法
- php 获取今日、昨日、上周、本月的起始时间戳和结束时间戳的方法
- DEDE文章列表获取单篇文章TAGS解决方法
- 转载一篇张鑫旭大大的文章,我感觉不错!!【获取元素CSS值之getComputedStyle方法熟悉】
- (android高仿系列)今日头条 --新闻阅读器 (一)
- 前几天我的问题终于在网上找到方法了!