var http = require('http');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var iconv = require('iconv-lite');
iconv.skipDecodeWarning = true; //Warning output omitted
// Request settings for fetching the Cyworld home page.
var options = {
  uri: 'http://cy.cyworld.com/home/ALG5LB24',
  headers: {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; IPMS/A640400A-14D460801A1-000000426571; TCO_20110131100426; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; Tablet PC 2.0)'
  },
  // `encoding: null` makes `request` return the body as a raw Buffer, so
  // iconv-lite decodes the original bytes instead of an already-converted
  // string. (The option name is lowercase; the original line was not valid
  // JavaScript.)
  encoding: null
};

// Fetch the page, decode the body as UTF-8, parse it with cheerio and print
// the decoded HTML to the console.
request(options, function (err, response, body) {
  // On a transport error there is no body to decode; report it and stop.
  if (err) {
    console.error('Request failed:', err);
    return;
  }
  var decode_url = iconv.decode(body, 'utf-8');
  // `$` is the cheerio handle for querying the parsed document, e.g.
  // $('#postArea'); it is not used further in this snippet.
  var $ = cheerio.load(decode_url);
  console.log(decode_url);
});
The code above uses the cheerio module to scrape the URL and print the result to the console. On other sites all of the content is scraped, so the inner elements can be accessed. Strangely, on Cyworld alone the contents of a few div classes cannot be scraped. (To be exact, the div itself is scraped, but its inner contents are not.) For example, if you select the div class postArea from the scraped page and print only that div, it should contain many elements, but only the div itself shows up in the output.
I suspect Cyworld is blocking this. Should I use a different library to solve it?
node.js crawler scrapy
The inner content of the div#postArea element appears to be DOM content that is generated dynamically by JavaScript on the client (in the browser).
The HTML delivered over HTTP contains only an empty container such as <div id="postArea"></div>,
so its inner elements are not available to the scraper.
One solution is to use a library that runs and renders JavaScript on the server, such as PhantomJS.
I think postArea is empty in the base template.
Most likely, after the default template has loaded, the content to be rendered into postArea is fetched by JavaScript running in the web browser.
If you inspect the network requests made by the page you mentioned, the following request appears to be the one that fetches the contents of the post area.
http://cy.cyworld.com/home/ALG5LB24/postlist?startdate=&enddate=&folderid=&tagname=&_=1459310094342
© 2024 OneMinuteCode. All rights reserved.