`

HttpClient模拟浏览器抓取网页

阅读更多
1. 设置请求头消息User-Agent模拟浏览器

Demo01.java

package com.andrew.httpClient.chap02;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

/**
 * Fetches a web page with a browser-like {@code User-Agent} header and prints its body.
 */
public class Demo01 {
    /**
     * Sends a GET request to a fixed URL and prints the response body decoded as UTF-8
     * (the charset used when the response entity does not declare one).
     *
     * @param args unused
     * @throws Exception if the request fails or the body cannot be read
     */
    public static void main(String[] args) throws Exception {
        HttpGet httpGet = new HttpGet("http://www.tuicool.com/");
        // Present a Firefox User-Agent so the request looks like it comes from a browser.
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");

        // try-with-resources closes the response first and then the client,
        // even when executing the request or reading the body throws.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            HttpEntity entity = response.getEntity();
            // A response is not required to carry an entity; print an empty body in that case
            // instead of letting EntityUtils reject the null argument.
            String body = (entity == null) ? "" : EntityUtils.toString(entity, "utf-8");
            System.out.println("网页内容:" + body);
        }
    }
}


2. 获取响应内容类型Content-Type

Demo02.java

package com.andrew.httpClient.chap02;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

/**
 * Requests a resource with a browser-like {@code User-Agent} header and prints the
 * {@code Content-Type} reported for the response entity.
 */
public class Demo02 {
    /**
     * Sends a GET request to a fixed URL and prints the response's content type.
     * The response body itself is never read.
     *
     * @param args unused
     * @throws Exception if the request fails
     */
    public static void main(String[] args) throws Exception {
        HttpGet httpGet = new HttpGet("http://central.maven.org/maven2/HTTPClient/HTTPClient/0.3-3/HTTPClient-0.3-3.jar");
        // Present a Firefox User-Agent so the request looks like it comes from a browser.
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");

        // try-with-resources closes the response first and then the client,
        // even when executing the request throws.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            System.out.println("Content-Type:" + describeContentType(response.getEntity()));
        }
    }

    /**
     * Returns the entity's Content-Type header value, or a placeholder when the response
     * has no entity or the entity carries no Content-Type header.
     */
    private static String describeContentType(HttpEntity entity) {
        if (entity == null || entity.getContentType() == null) {
            return "(none)";
        }
        return entity.getContentType().getValue();
    }
}

运行结果:
Content-Type:application/java-archive


3. 获取响应状态Status

200正常
403拒绝
500服务器报错
404未找到页面


Demo03.java

package com.andrew.httpClient.chap02;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

/**
 * Requests a page with a browser-like {@code User-Agent} header and prints the HTTP
 * status code together with the {@code Content-Type} reported for the response entity.
 */
public class Demo03 {
    /**
     * Sends a GET request to a fixed URL and prints the status code and content type.
     * The response body itself is never read.
     *
     * @param args unused
     * @throws Exception if the request fails
     */
    public static void main(String[] args) throws Exception {
        HttpGet httpGet = new HttpGet("http://www.open1111.com/");
        // Present a Firefox User-Agent so the request looks like it comes from a browser.
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");

        // try-with-resources closes the response first and then the client,
        // even when executing the request throws.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            System.out.println("Status:" + response.getStatusLine().getStatusCode());
            System.out.println("Content-Type:" + describeContentType(response.getEntity()));
        }
    }

    /**
     * Returns the entity's Content-Type header value, or a placeholder when the response
     * has no entity or the entity carries no Content-Type header.
     */
    private static String describeContentType(HttpEntity entity) {
        if (entity == null || entity.getContentType() == null) {
            return "(none)";
        }
        return entity.getContentType().getValue();
    }
}

运行结果:
Status:200
Content-Type:text/html;charset=UTF-8
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics