GetChongQingArticleService.java 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. package com.example.mos.service.GetWebArticle;
  2. import com.example.mos.common.model.dto.ArticleInfoDTO;
  3. import com.example.mos.service.ArticleInfoService;
  4. import lombok.RequiredArgsConstructor;
  5. import lombok.extern.slf4j.Slf4j;
  6. import org.apache.http.HttpResponse;
  7. import org.apache.http.client.methods.HttpGet;
  8. import org.apache.http.impl.client.CloseableHttpClient;
  9. import org.apache.http.impl.client.HttpClients;
  10. import org.apache.http.util.EntityUtils;
  11. import org.jsoup.Jsoup;
  12. import org.jsoup.nodes.Document;
  13. import org.jsoup.nodes.Element;
  14. import org.jsoup.select.Elements;
  15. import org.springframework.stereotype.Service;
  16. import java.nio.charset.Charset;
  17. import java.nio.charset.StandardCharsets;
  18. import java.text.ParseException;
  19. import java.text.SimpleDateFormat;
  20. import java.util.ArrayList;
  21. import java.util.Date;
  22. import java.util.List;
  23. /**
  24. * 重庆市知识产权局
  25. * @Author xiexiang
  26. * @Date 2024/8/19
  27. */
  28. @Slf4j
  29. @Service
  30. @RequiredArgsConstructor
  31. public class GetChongQingArticleService {
  32. private final ArticleInfoService articleInfoService;
  33. public String getChongQing() {
  34. String responseStr = "success";
  35. List<String> responseBodies = new ArrayList<>();
  36. List<String> urls = new ArrayList<>();
  37. urls.add("https://zscqj.cq.gov.cn/zwxx_232/tzgg/");
  38. CloseableHttpClient httpClient = HttpClients.createDefault();
  39. try {
  40. for (String url : urls) {
  41. HttpGet request = new HttpGet(url);
  42. setupHeaders(request);
  43. HttpResponse response = httpClient.execute(request);
  44. String responseBody = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
  45. responseBodies.add(responseBody);
  46. responseStr = responseBody;
  47. }
  48. httpClient.close();
  49. if (responseBodies != null && !responseBodies.isEmpty()) {
  50. this.readJson(responseBodies);
  51. }
  52. } catch (Exception e) {
  53. e.printStackTrace();
  54. }
  55. return responseStr;
  56. }
  57. private void setupHeaders(HttpGet request) {
  58. request.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0");
  59. request.addHeader("Cookie", "lcid=1043; __jsluid_s=d8da8b71aed1f47e6d74773d87ebf074; _va_ref=%5B%22%22%2C%22%22%2C1724059829%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DuiYh-tKeu5W26B6buTNuL3NeDZ5sZfdDhBXd4R344papXwOqiQ_DhapSKUUahDaN%26wd%3D%26eqid%3Dc0d9d9b6002526d400000004668cdae9%22%5D; _va_id=f7c6b7152dd01f89.1720512427.4.1724059829.1724059829.; _va_ses=*");
  60. }
  61. private void readJson(List<String> responseBodies) {
  62. try {
  63. List<ArticleInfoDTO> articleInfoDTOList = new ArrayList<>();
  64. for (String responseBody : responseBodies) {
  65. // 解析包含 JSON 数据的 <abbr> 标签
  66. Document doc = Jsoup.parse(responseBody);
  67. Elements liElements = doc.select("div.gl-con.rt ul.gl-list li");
  68. if (liElements != null && !liElements.isEmpty()) {
  69. for (Element li : liElements) {
  70. String title = li.select("a").attr("title");
  71. String url = li.select("a").attr("href");
  72. String time = li.select("span").text();
  73. if (!time.isEmpty() && !url.isEmpty()) {
  74. SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
  75. Date date = new Date();
  76. if (!time.isEmpty()) {
  77. try {
  78. date = dateFormat.parse(time);
  79. } catch (ParseException e) {
  80. System.out.println("time parse error");
  81. continue;
  82. }
  83. }
  84. ArticleInfoDTO articleInfoDTO = new ArticleInfoDTO();
  85. articleInfoDTO.setTitle(title);
  86. if (url.contains("https")) {
  87. articleInfoDTO.setLink(url);
  88. } else {
  89. articleInfoDTO.setLink("https://zscqj.cq.gov.cn/zwxx_232/tzgg/" + url.substring(2));
  90. }
  91. articleInfoDTO.setTime(date);
  92. articleInfoDTO.setFakeId("8");
  93. articleInfoDTO.setSource(1);
  94. // 添加到列表
  95. articleInfoDTOList.add(articleInfoDTO);
  96. }
  97. }
  98. }
  99. }
  100. // 将提取的信息添加到数据库或进行其他操作
  101. articleInfoService.addArticles(articleInfoDTOList);
  102. } catch (Exception e) {
  103. e.printStackTrace();
  104. }
  105. }
  106. }