JavaScript

Nodejs实现爬虫的几种方式

本文主要是介绍Nodejs实现爬虫的几种方式,对大家解决编程问题具有一定的参考价值,需要的程序猿们随着小编来一起学习吧!

获取代理 IP

// Requires the axios module - npm install axios --save
const axios = require('axios')

// The id / secret values come from the extraction-API page on the ApeYun site
const extractParams = {
    id: 'xxx',
    secret: 'xxx',
    limit: 1,
    format: 'txt',
    auth_mode: 'auto'
};

// Request one proxy IP from the tunnel API and print it.
(async () => {
    try {
        const response = await axios.get('http://tunnel-api.apeyun.com/q', {
            params: extractParams,
        });
        console.log('IP:', response.data);
    } catch (e) {
        console.error('Error:', e);
    }
})();

爬虫程序

  • axios


     

    // Requires the axios module - npm install axios --save
    const axios = require('axios')

    // Target page to crawl
    const targetUrl = "http://www.baidu.com"

    // Proxy server; assume the extracted proxy ip is 123.123.123.123:1234
    const proxyHost = "123.123.123.123"
    const proxyPort = 1234

    // Proxy credentials (obtained from the ApeYun site)
    const proxyUser = "xxx"
    const proxyPass = "xxx"

    // Proxy config shape per the official docs:
    // https://github.com/axios/axios#request-config
    axios
        .get(targetUrl, {
            proxy: {
                host: proxyHost,
                port: proxyPort,
                auth: {
                    username: proxyUser,
                    password: proxyPass
                }
            }
        })
        .then((response) => {
            // handle success
            console.log(response.data)
        })
        .catch((error) => {
            // handle error
            console.log(error)
        })
        .finally(() => {
            // always executed
        });
    

      

  • http

  •  

  • const http = require("http")
    const url  = require("url")
    
    // 要访问的目标页面
    const targetUrl = "http://www.baidu.com"
    const urlParsed   = url.parse(targetUrl)
    
    // 代理服务器,假设提取到的代理ip是123.123.123.123:1234
    const proxyHost = "123.123.123.123"
    const proxyPort = "1234"
    
    // 代理隧道验证信息(猿人云官网获取)
    const proxyUser = "xxx"
    const proxyPass = "xxx"
    
    const base64 = Buffer.from(proxyUser + ":" + proxyPass).toString("base64")
    
    const options = {
        host: proxyHost,
        port: proxyPort,
        path: targetUrl,
        method: "GET",
        headers: {
            "Host": urlParsed.hostname,
            "Proxy-Authorization": "Basic " + base64
        }
    }
    
    http.request(options, function(res) {
        console.log("got response: " + res.statusCode)
    })
    .on("error", function(err) {
        console.log(err)
    })
    .end()
    

      

  • request


     

  • // 需要安装 request 模块 - npm install request --save
    const request = require("request")
    
    // 要访问的目标页面
    const targetUrl = "http://www.baidu.com"
    
    // 代理服务器,假设提取到的代理ip是123.123.123.123:1234
    const proxyHost = "123.123.123.123"
    const proxyPort = 1234
    
    // 代理隧道验证信息(猿人云官网获取)
    const proxyUser = "xxx"
    const proxyPass = "xxx"
    
    const proxyUrl = "http://" + proxyUser + ":" + proxyPass + "@" + proxyHost + ":" + proxyPort
    
    const proxiedRequest = request.defaults({'proxy': proxyUrl})
    
    const options = {
        url: targetUrl,
        headers: {}
    }
    
    proxiedRequest.get(options, function (err, res, body) {
        console.log("got response: " + res.statusCode)
        console.log("got response: " + body)
    })
    .on("error", function (err) {
        console.log(err);
    })
    

      

  • superagent


     

     

  • // 需要安装 superagent 和 superagent-proxy 模块 - npm install superagent superagent-proxy --save
    const request = require("superagent")
    require("superagent-proxy")(request)
    
    // 要访问的目标页面
    const targetUrl = "http://www.baidu.com"
    
    // 代理服务器,假设提取到的代理ip是123.123.123.123:1234
    const proxyHost = "123.123.123.123";
    const proxyPort = 1234;
    
    // 代理隧道验证信息(猿人云官网获取)
    const proxyUser = "xxx"
    const proxyPass = "xxx"
    
    const proxyUrl = "http://" + proxyUser + ":" + proxyPass + "@" + proxyHost + ":" + proxyPort
    
    request.get(targetUrl).proxy(proxyUrl).end(function onResponse(err, res) {
        if (err) {
            return console.log(err)
        }
        console.log(res.status, res.headers)
        console.log(res.text)
    });

使用 node 运行上述任一文件,当你的控制台打印出一大段 HTML 代码时,说明这个爬虫程序运行成功了

 

转自:https://mp.weixin.qq.com/s/JA11NzbbHtKqgijmdmJPlw

 

这篇关于Nodejs实现爬虫的几种方式的文章就介绍到这儿,希望我们推荐的文章对大家有所帮助,也希望大家多多支持为之网!