// GetEcigaretteService.java
package com.cslg.ppa.service.GetWebArticle;

import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
import com.cslg.ppa.common.utils.DateUtil;
import com.cslg.ppa.common.utils.RegexUtil;
import com.cslg.ppa.dto.GetArticleInfoDTO;
import com.cslg.ppa.entity.ArticleInfo;
import com.cslg.ppa.entity.SourceInfo;
import com.cslg.ppa.mapper.SourceInfoMapper;
import com.cslg.ppa.service.ArticleInfoService;
import com.cslg.ppa.service.commom.DifyService;
import com.cslg.ppa.service.commom.XmlParseService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.CollectionUtils;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.*;

  35. @Slf4j
  36. @Service
  37. @RequiredArgsConstructor
  38. public class GetEcigaretteService {
  39. private final ArticleInfoService articleInfoService;
  40. @Autowired
  41. private DifyService difyService;
  42. @Autowired
  43. private SourceInfoMapper sourceInfoMapper;
  44. // @Scheduled(cron = "0 0 5 * * ?")
  45. // @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Throwable.class)
  46. public void execute() {
  47. System.out.println(new Date() + "-------AddArticle-Begin");
  48. List<SourceInfo> sourceInfos = sourceInfoMapper.selectList(new LambdaQueryWrapper<SourceInfo>()
  49. .in(SourceInfo::getCategoryId, Arrays.asList(16, 17, 18, 19, 20, 21, 22)));
  50. for (SourceInfo sourceInfo : sourceInfos) {
  51. String sourceUrl = sourceInfo.getSourceUrl();
  52. String sourceName = sourceInfo.getSourceName();
  53. Integer categoryId = sourceInfo.getCategoryId();
  54. Integer sourceInfoId = sourceInfo.getId();
  55. this.crawlEcigaretteArticles(sourceUrl, categoryId, sourceInfoId, sourceName);
  56. }
  57. System.out.println(new Date() + "-------AddArticle-End");
  58. }
  59. public void crawlEcigaretteArticles(String baseUrl, Integer type, Integer sourceId, String sourceName) {
  60. List<GetArticleInfoDTO> articleInfoDTOS = null;
  61. try {
  62. // 使用Jsoup连接并解析网页
  63. Document doc = Jsoup.connect(baseUrl)
  64. .timeout(30000) // 增加超时时间
  65. .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36")
  66. .followRedirects(true) // 跟随重定向
  67. .get();
  68. Elements elements = doc.select("div.list-box");
  69. Elements newsItems = elements.select("a[href]");
  70. articleInfoDTOS = new ArrayList<>();
  71. int sum = 0;
  72. for (Element item : newsItems) {
  73. if (sum > 10) {
  74. break;
  75. }
  76. //获取时间
  77. Elements timeElement = item.select("span.time");
  78. String dateStr = timeElement.text().trim();
  79. String yesterdayDateStr = DateUtil.getYesterdayDateStr();
  80. if (!StringUtils.equals(dateStr, "1天前")) {
  81. break;
  82. }
  83. Date date = new Date();
  84. SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
  85. try {
  86. date = dateFormat.parse(yesterdayDateStr);
  87. } catch (Exception e) {
  88. continue;
  89. }
  90. Element linkElement = item.selectFirst("a[href]");
  91. String link = linkElement.absUrl("href");
  92. Elements titleElement = item.select("div.title");
  93. String title = titleElement.text().trim();
  94. // 使用
  95. if (shouldSkipTobaccoTitle(sourceName, title)) {
  96. continue;
  97. }
  98. // 检查文章是否已存在
  99. long count = articleInfoService.count(new LambdaQueryWrapper<ArticleInfo>()
  100. .eq(ArticleInfo::getTitle, title)
  101. .or()
  102. .eq(ArticleInfo::getArticleUrl, link));
  103. if (count > 0) {
  104. continue; // 文章已存在,跳过
  105. }
  106. Elements descElement = item.select("div.desc");
  107. String desc = descElement.text().trim();
  108. Map<String, String> map = null;
  109. try {
  110. map = this.getDigest(link);
  111. } catch (IOException e) {
  112. continue;
  113. }
  114. String aiContent = map.get("aiContent");
  115. String content = map.get("content");
  116. if (StringUtils.isEmpty(aiContent)) {
  117. continue;
  118. }
  119. String condensedAbstract = null;
  120. try {
  121. condensedAbstract = difyService.getEcigaretteSummary(aiContent);
  122. } catch (Exception ignored) {
  123. }
  124. if (StringUtils.isEmpty(condensedAbstract)) {
  125. continue;
  126. }
  127. GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
  128. articleInfoDTO.setCategoryId(type);
  129. articleInfoDTO.setSourceId(sourceId);
  130. articleInfoDTO.setArticleUrl(link);
  131. articleInfoDTO.setTitle(title);
  132. articleInfoDTO.setPublicTime(date);
  133. articleInfoDTO.setDigest(condensedAbstract);
  134. articleInfoDTO.setOldContent(desc + "\n" + content);
  135. articleInfoDTOS.add(articleInfoDTO);
  136. sum++;
  137. }
  138. articleInfoService.batchAddArticleInfo(articleInfoDTOS);
  139. } catch (Exception e) {
  140. System.out.println("Read Json Error" + e);
  141. }
  142. }
  143. /**
  144. * 检查是否需要跳过当前标题(适用于"两个至上-诉讼"来源)
  145. */
  146. private static boolean shouldSkipTobaccoTitle(String sourceName, String title) {
  147. if (!StringUtils.equals(sourceName, "两个至上-诉讼")) {
  148. return false;
  149. }
  150. if (StringUtils.isEmpty(title) || !title.contains("烟")) {
  151. return false;
  152. }
  153. // 排除关键词
  154. String[] excludedKeywords = {"烟店", "零售商", "囚犯","未成年","嫌疑人"};
  155. for (String keyword : excludedKeywords) {
  156. if (title.contains(keyword)) {
  157. return true;
  158. }
  159. }
  160. return false; // 满足所有条件,不跳过
  161. }
  162. public Map<String, String> getDigest(String url) throws IOException {
  163. Map<String, String> map = null;
  164. CloseableHttpClient httpClient = HttpClients.createDefault();
  165. HttpGet request = new HttpGet(url);
  166. HttpResponse response = httpClient.execute(request);
  167. String responseBody = EntityUtils.toString(response.getEntity());
  168. httpClient.close();
  169. if (responseBody != null) {
  170. map = this.readJson(responseBody);
  171. }
  172. return map;
  173. }
  174. public Map<String, String> readJson(String responseBody) {
  175. Map<String, String> map = new HashMap<>();
  176. String content = "";
  177. try {
  178. //使用JSoup解析HTML
  179. Document doc = Jsoup.parse(responseBody);
  180. // 获取description元数据
  181. Element element = doc.selectFirst("div.ckeditor");
  182. String text1 = element.text();
  183. // 获取h2标签的文本
  184. List<String> list = new ArrayList<>();
  185. Element element1 = element.selectFirst("ul, ol");
  186. if (element1 != null) {
  187. list.add("核心看点:");
  188. Elements liElements = element1.select("li");
  189. int i = 1;
  190. for (Element ment : liElements) {
  191. String text = ment.text();
  192. if (StringUtils.isNotEmpty(text)) {
  193. list.add("(" + i + ")" + text);
  194. i++;
  195. }
  196. }
  197. if (list.size() == 1) {
  198. String articleContent = element.text();
  199. list.add(articleContent);
  200. content = StringUtils.join(list, "\n");
  201. } else {
  202. content = StringUtils.join(list, "\n");
  203. }
  204. } else {
  205. String articleContent = element.text();
  206. list.add(articleContent);
  207. content = StringUtils.join(list, "\n");
  208. }
  209. map.put("aiContent", text1);
  210. map.put("content", content);
  211. } catch (Exception e) {
  212. content = "";
  213. map.put("content", content);
  214. }
  215. return map;
  216. }
  217. public void crawlEcigaretteArticles1(String baseUrl, Integer type, Integer sourceId, String sourceName) {
  218. List<GetArticleInfoDTO> articleInfoDTOS = null;
  219. try {
  220. // 使用Jsoup连接并解析网页
  221. Document doc = Jsoup.connect(baseUrl)
  222. .timeout(30000) // 增加超时时间
  223. .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36")
  224. .followRedirects(true) // 跟随重定向
  225. .get();
  226. Elements elements = doc.select("div.list-box");
  227. Elements newsItems = elements.select("a[href]");
  228. articleInfoDTOS = new ArrayList<>();
  229. int sum = 0;
  230. for (Element item : newsItems) {
  231. //获取时间
  232. Elements timeElement = item.select("span.time");
  233. String dateStr = timeElement.text().trim();
  234. String yesterdayDateStr = null;
  235. if (dateStr.contains("小时")) {
  236. yesterdayDateStr = LocalDate.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd"));
  237. } else if (StringUtils.equals(dateStr, "1天前")) {
  238. yesterdayDateStr = DateUtil.getYesterdayDateStr();
  239. } else if (StringUtils.equals(dateStr, "2天前")) {
  240. yesterdayDateStr = LocalDate.now().minusDays(2).format(DateTimeFormatter.ofPattern("yyyy-MM-dd"));
  241. } else {
  242. yesterdayDateStr = LocalDate.now().minusDays(3).format(DateTimeFormatter.ofPattern("yyyy")) + "-" + dateStr;
  243. }
  244. Date date = new Date();
  245. SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
  246. try {
  247. date = dateFormat.parse(yesterdayDateStr);
  248. } catch (Exception e) {
  249. System.out.println("time error" + yesterdayDateStr);
  250. continue;
  251. }
  252. Element linkElement = item.selectFirst("a[href]");
  253. String link = linkElement.absUrl("href");
  254. Elements titleElement = item.select("div.title");
  255. String title = titleElement.text().trim();
  256. // 使用
  257. if (shouldSkipTobaccoTitle(sourceName, title)) {
  258. continue;
  259. }
  260. // 检查文章是否已存在
  261. long count = articleInfoService.count(new LambdaQueryWrapper<ArticleInfo>()
  262. .eq(ArticleInfo::getTitle, title)
  263. .or()
  264. .eq(ArticleInfo::getArticleUrl, link));
  265. if (count > 0) {
  266. continue; // 文章已存在,跳过
  267. }
  268. Elements descElement = item.select("div.desc");
  269. String desc = descElement.text().trim();
  270. Map<String, String> map = null;
  271. try {
  272. map = this.getDigest(link);
  273. } catch (IOException e) {
  274. continue;
  275. }
  276. String aiContent = map.get("aiContent");
  277. String content = map.get("content");
  278. if (StringUtils.isEmpty(aiContent)) {
  279. continue;
  280. }
  281. String condensedAbstract = null;
  282. try {
  283. condensedAbstract = difyService.getEcigaretteSummary(aiContent);
  284. } catch (Exception ignored) {
  285. }
  286. if (StringUtils.isEmpty(condensedAbstract)) {
  287. continue;
  288. }
  289. GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
  290. articleInfoDTO.setCategoryId(type);
  291. articleInfoDTO.setSourceId(sourceId);
  292. articleInfoDTO.setArticleUrl(link);
  293. articleInfoDTO.setTitle(title);
  294. articleInfoDTO.setPublicTime(date);
  295. articleInfoDTO.setDigest(condensedAbstract);
  296. articleInfoDTO.setOldContent(desc + "\n" + content);
  297. articleInfoDTOS.add(articleInfoDTO);
  298. sum++;
  299. }
  300. articleInfoService.batchAddArticleInfo(articleInfoDTOS);
  301. } catch (Exception e) {
  302. System.out.println("Read Json Error" + e);
  303. }
  304. }
  305. }