package com.example.mos.service.GetWebArticle;

import com.example.mos.common.model.dto.ArticleInfoDTO;
import com.example.mos.service.ArticleInfoService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Service;

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * Chongqing Intellectual Property Office.
 *
 * <p>Downloads the article list page of the office's website, extracts the
 * title, link and date of every list entry and hands the result to
 * {@link ArticleInfoService#addArticles}.
 *
 * @Author xiexiang
 * @Date 2024/8/19
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class GetChongQingArticleService {

    /** List page that is scraped; also the base against which relative article links are resolved. */
    private static final String LIST_PAGE_URL = "https://zscqj.cq.gov.cn/zwxx_232/tzgg/";

    /** Pattern used to parse the date text shown next to each list entry. */
    private static final String DATE_PATTERN = "yyyy-MM-dd";

    private final ArticleInfoService articleInfoService;

    /**
     * Fetches the list page, parses the entries and passes them to the article service.
     *
     * <p>This method never throws: any failure while fetching is logged and the
     * parsing/persisting step is skipped.
     *
     * @return the body of the last page that was fetched successfully, or the
     *         literal {@code "success"} when no page could be fetched
     */
    public String getChongQing() {
        String responseStr = "success";
        List<String> responseBodies = new ArrayList<>();
        List<String> urls = new ArrayList<>();
        urls.add(LIST_PAGE_URL);

        // try-with-resources guarantees client and responses are released even when a request fails.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            for (String url : urls) {
                HttpGet request = new HttpGet(url);
                setupHeaders(request);
                try (CloseableHttpResponse response = httpClient.execute(request)) {
                    int status = response.getStatusLine().getStatusCode();
                    if (status != 200) {
                        // The body is still processed as before; the warning only aids diagnosis.
                        log.warn("Unexpected HTTP status {} while fetching {}", status, url);
                    }
                    if (response.getEntity() == null) {
                        // EntityUtils.toString rejects a null entity, so skip body-less responses.
                        log.warn("Response for {} has no body, skipping", url);
                        continue;
                    }
                    String responseBody = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
                    responseBodies.add(responseBody);
                    responseStr = responseBody;
                }
            }
        } catch (Exception e) {
            // Boundary catch: callers rely on this method not throwing.
            log.error("Failed to fetch article list from {}", LIST_PAGE_URL, e);
            return responseStr;
        }

        if (!responseBodies.isEmpty()) {
            this.readJson(responseBodies);
        }
        return responseStr;
    }

    /**
     * Adds a browser-like {@code User-Agent} and a fixed {@code Cookie} header to the request.
     *
     * <p>NOTE(review): the cookie is a hard-coded value, presumably captured from a
     * browser session; confirm whether it can expire and needs refreshing.
     *
     * @param request the request to decorate
     */
    private void setupHeaders(HttpGet request) {
        request.addHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0");
        request.addHeader("Cookie",
                "lcid=1043; __jsluid_s=d8da8b71aed1f47e6d74773d87ebf074; _va_ref=%5B%22%22%2C%22%22%2C1724059829%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DuiYh-tKeu5W26B6buTNuL3NeDZ5sZfdDhBXd4R344papXwOqiQ_DhapSKUUahDaN%26wd%3D%26eqid%3Dc0d9d9b6002526d400000004668cdae9%22%5D; _va_id=f7c6b7152dd01f89.1720512427.4.1724059829.1724059829.; _va_ses=*");
    }

    /**
     * Parses the fetched list pages and passes the extracted articles to the article service.
     *
     * <p>Despite its name this method parses HTML, not JSON. Entries without a link,
     * without a date, or with a date that does not match {@code yyyy-MM-dd} are skipped.
     * Any failure is logged and not propagated.
     *
     * @param responseBodies HTML bodies of the list page(s); all are assumed to originate
     *                       from {@link #LIST_PAGE_URL} when resolving relative links
     */
    private void readJson(List<String> responseBodies) {
        try {
            List<ArticleInfoDTO> articleInfoDTOList = new ArrayList<>();
            // One formatter per call: SimpleDateFormat is not thread-safe, so it is never shared.
            SimpleDateFormat dateFormat = new SimpleDateFormat(DATE_PATTERN);

            for (String responseBody : responseBodies) {
                // Supplying the base URI lets jsoup resolve relative hrefs ("./x", "../x", "/x").
                Document doc = Jsoup.parse(responseBody, LIST_PAGE_URL);
                Elements liElements = doc.select("div.gl-con.rt ul.gl-list li");

                for (Element li : liElements) {
                    Elements anchors = li.select("a");
                    String title = anchors.attr("title");
                    String href = anchors.attr("href");
                    String time = li.select("span").text();
                    if (time.isEmpty() || href.isEmpty()) {
                        continue;
                    }

                    Date date;
                    try {
                        date = dateFormat.parse(time);
                    } catch (ParseException e) {
                        log.warn("Skipping article '{}': cannot parse date '{}'", title, time);
                        continue;
                    }

                    // "abs:" makes jsoup return the href resolved against the document's base URI.
                    String link = anchors.attr("abs:href");
                    if (link.isEmpty()) {
                        log.warn("Skipping article '{}': cannot resolve link '{}'", title, href);
                        continue;
                    }

                    ArticleInfoDTO articleInfoDTO = new ArticleInfoDTO();
                    articleInfoDTO.setTitle(title);
                    articleInfoDTO.setLink(link);
                    articleInfoDTO.setTime(date);
                    // NOTE(review): "8" and 1 look like identifiers for this site / source type -- confirm.
                    articleInfoDTO.setFakeId("8");
                    articleInfoDTO.setSource(1);
                    articleInfoDTOList.add(articleInfoDTO);
                }
            }

            // Hand the extracted articles to the service layer.
            articleInfoService.addArticles(articleInfoDTOList);
        } catch (Exception e) {
            // Best-effort step: a parsing or persistence failure must not propagate to the caller.
            log.error("Failed to parse or save articles from {}", LIST_PAGE_URL, e);
        }
    }
}