1.需要导入的资源
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.9.2</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
2.这是我自己写的工具类,用于模拟get和Post请求, headers是需要自己填的请求头信息
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Map;
import java.util.zip.GZIPInputStream;
public class HTTPClientUtil {

    /**
     * Sends an HTTP GET request.
     *
     * @param url     the request URL
     * @param headers request headers to set on the request (caller-supplied)
     * @return the response body when the status code is 200, otherwise {@code null}
     * @throws Exception on network or I/O failure
     */
    public String testGet(String url, Map<String, String> headers) throws Exception {
        System.out.println("请求方式:GET");
        System.out.println("请求URL:"+url);
        HttpGet httpGet = new HttpGet(url);
        for (Map.Entry<String, String> header : headers.entrySet()) {
            httpGet.setHeader(header.getKey(), header.getValue());
        }
        // try-with-resources guarantees both client and response are closed on
        // every path (the original leaked them whenever execute() threw, and
        // closed the client before the response).
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            System.out.println("响应码:"+response.getStatusLine().getStatusCode());
            if (response.getStatusLine().getStatusCode() == 200) {
                return ParseResponse(response);
            }
            return null;
        }
    }

    /**
     * Sends an HTTP POST request.
     *
     * @param data    request parameters; key {@code "url"} is the target URL and
     *                optional key {@code "data"} is the raw request body
     * @param headers request headers to set on the request (caller-supplied)
     * @return the response body when the status code is 200, otherwise {@code null}
     * @throws Exception on network or I/O failure
     */
    public String testPost(Map<String, String> data, Map<String, String> headers) throws Exception {
        System.out.println("请求方式:POST");
        System.out.println("请求url:"+data.get("url"));
        // Fixed: the original named this variable "httpGet" and commented it as
        // a GET request even though it is a POST.
        HttpPost httpPost = new HttpPost(data.get("url"));
        for (Map.Entry<String, String> header : headers.entrySet()) {
            httpPost.setHeader(header.getKey(), header.getValue());
        }
        // Attach the request body when one was supplied.
        if (data.containsKey("data")) {
            System.out.println("请求体:"+data.get("data"));
            // Fixed: StringEntity defaults to ISO-8859-1, which corrupts
            // non-ASCII (e.g. Chinese) bodies; encode explicitly as UTF-8.
            StringEntity params = new StringEntity(data.get("data"), "UTF-8");
            httpPost.setEntity(params);
        }
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            System.out.println("响应码:"+response.getStatusLine().getStatusCode());
            if (response.getStatusLine().getStatusCode() == 200) {
                return ParseResponse(response);
            }
            return null;
        }
    }

    /**
     * Reads the response entity as a string, transparently decompressing
     * gzip-encoded bodies.
     *
     * @param response the response whose entity is consumed
     * @return the body text, or {@code null} when reading fails
     */
    public String ParseResponse(CloseableHttpResponse response){
        String re = null;
        try {
            HttpEntity entity = response.getEntity();
            // BUG FIX: the original condition was literally `if (false)`,
            // making the gzip branch dead code. Detect gzip from the
            // Content-Encoding header instead.
            boolean gzipped = entity.getContentEncoding() != null
                    && entity.getContentEncoding().getValue().toLowerCase().contains("gzip");
            if (gzipped) {
                System.out.println("检查响应被gzip压缩");
                // Decode explicitly as UTF-8 rather than the platform default.
                try (InputStream gzipStream = entity.getContent();
                     BufferedReader bufferedReader = new BufferedReader(
                             new InputStreamReader(new GZIPInputStream(gzipStream), "UTF-8"))) {
                    StringBuilder stringBuilder = new StringBuilder();
                    String line;
                    while ((line = bufferedReader.readLine()) != null) {
                        System.out.println("打印内容:"+line);
                        stringBuilder.append(line).append("\n");
                    }
                    re = stringBuilder.toString();
                }
            } else {
                // Not gzip-compressed: read the entity directly.
                re = EntityUtils.toString(entity, "UTF-8");
            }
        } catch (Exception e){
            e.printStackTrace();
        }
        return re;
    }
}
3.这是爬虫一个网站 用到Jsoup的部分方法,仅供参考
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class JsoupUtil {
// 辅助方法:判断一个字符是否是汉字(这里简化判断,只判断是否在汉字字符集范围内)
private static boolean isHanzi(char c) {
return c >= '\u4e00' && c <= '\u9fa5';
}
private Map<String, Object> parseHtml_text_page(String html) {
Map<String, Object> map = new HashMap<>();
try {
String text = "";
List<String> list = new ArrayList<>();
// 获取html的文档对象
Document doc = Jsoup.parse(html);
try {
网页的文章内容
Element content = doc.getElementById("chapterinfo");
text = content.html();
} catch (Exception e) {
e.printStackTrace();
}
//解析页数
try {
//根据class找组件
Elements element2 = doc.getElementsByClass("chapterPages");
//根据id找组件
//doc.getElementById()
String arr2 = element2.text();
if (arr2 != null && arr2.length() > 0) {
System.out.println(arr2);
String[] split = arr2.split("【");
//【1】【2】【3】【4】【5】【6】
if (split.length > 1) {
for (int i = 1; i < split.length; i++) {
//获取a标签包含的内容获取a标签
Elements select = doc.select("a:contains(" + "【" + split[i] + ")");
String href = select.get(0).attr("href");
if (href.contains("html"))
list.add(href);
}
}
}
} catch (Exception e) {
e.printStackTrace();
}
map.put("text", "");
map.put("list", list);
System.out.println("章节页数:" + list.toString());
//System.out.println("章节文本:"+text);
} catch (Exception e) {
e.printStackTrace();
}
return map;
}
}