123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113 |
- package com.example.mos.service.GetWebArticle;
- import com.example.mos.common.model.dto.ArticleInfoDTO;
- import com.example.mos.service.ArticleInfoService;
- import lombok.RequiredArgsConstructor;
- import lombok.extern.slf4j.Slf4j;
- import org.apache.http.HttpResponse;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.impl.client.CloseableHttpClient;
- import org.apache.http.impl.client.HttpClients;
- import org.apache.http.util.EntityUtils;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- import org.springframework.stereotype.Service;
- import java.nio.charset.Charset;
- import java.nio.charset.StandardCharsets;
- import java.text.ParseException;
- import java.text.SimpleDateFormat;
- import java.util.ArrayList;
- import java.util.Date;
- import java.util.List;
- /**
- * 重庆市知识产权局
- * @Author xiexiang
- * @Date 2024/8/19
- */
- @Slf4j
- @Service
- @RequiredArgsConstructor
- public class GetChongQingArticleService {
- private final ArticleInfoService articleInfoService;
- public String getChongQing() {
- String responseStr = "success";
- List<String> responseBodies = new ArrayList<>();
- List<String> urls = new ArrayList<>();
- urls.add("https://zscqj.cq.gov.cn/zwxx_232/tzgg/");
- CloseableHttpClient httpClient = HttpClients.createDefault();
- try {
- for (String url : urls) {
- HttpGet request = new HttpGet(url);
- setupHeaders(request);
- HttpResponse response = httpClient.execute(request);
- String responseBody = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
- responseBodies.add(responseBody);
- responseStr = responseBody;
- }
- httpClient.close();
- if (responseBodies != null && !responseBodies.isEmpty()) {
- this.readJson(responseBodies);
- }
- } catch (Exception e) {
- e.printStackTrace();
- }
- return responseStr;
- }
- private void setupHeaders(HttpGet request) {
- request.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0");
- request.addHeader("Cookie", "lcid=1043; __jsluid_s=d8da8b71aed1f47e6d74773d87ebf074; _va_ref=%5B%22%22%2C%22%22%2C1724059829%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DuiYh-tKeu5W26B6buTNuL3NeDZ5sZfdDhBXd4R344papXwOqiQ_DhapSKUUahDaN%26wd%3D%26eqid%3Dc0d9d9b6002526d400000004668cdae9%22%5D; _va_id=f7c6b7152dd01f89.1720512427.4.1724059829.1724059829.; _va_ses=*");
- }
- private void readJson(List<String> responseBodies) {
- try {
- List<ArticleInfoDTO> articleInfoDTOList = new ArrayList<>();
- for (String responseBody : responseBodies) {
- // 解析包含 JSON 数据的 <abbr> 标签
- Document doc = Jsoup.parse(responseBody);
- Elements liElements = doc.select("div.gl-con.rt ul.gl-list li");
- if (liElements != null && !liElements.isEmpty()) {
- for (Element li : liElements) {
- String title = li.select("a").attr("title");
- String url = li.select("a").attr("href");
- String time = li.select("span").text();
- if (!time.isEmpty() && !url.isEmpty()) {
- SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
- Date date = new Date();
- if (!time.isEmpty()) {
- try {
- date = dateFormat.parse(time);
- } catch (ParseException e) {
- System.out.println("time parse error");
- continue;
- }
- }
- ArticleInfoDTO articleInfoDTO = new ArticleInfoDTO();
- articleInfoDTO.setTitle(title);
- if (url.contains("https")) {
- articleInfoDTO.setLink(url);
- } else {
- articleInfoDTO.setLink("https://zscqj.cq.gov.cn/zwxx_232/tzgg/" + url.substring(2));
- }
- articleInfoDTO.setTime(date);
- articleInfoDTO.setFakeId("8");
- articleInfoDTO.setSource(1);
- // 添加到列表
- articleInfoDTOList.add(articleInfoDTO);
- }
- }
- }
- }
- // 将提取的信息添加到数据库或进行其他操作
- articleInfoService.addArticles(articleInfoDTOList);
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- }
|