| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331 |
- package com.cslg.ppa.service.GetWebArticle;
- import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
- import com.cslg.ppa.common.utils.DateUtil;
- import com.cslg.ppa.common.utils.RegexUtil;
- import com.cslg.ppa.dto.GetArticleInfoDTO;
- import com.cslg.ppa.entity.ArticleInfo;
- import com.cslg.ppa.entity.SourceInfo;
- import com.cslg.ppa.mapper.SourceInfoMapper;
- import com.cslg.ppa.service.ArticleInfoService;
- import com.cslg.ppa.service.commom.DifyService;
- import com.cslg.ppa.service.commom.XmlParseService;
- import lombok.RequiredArgsConstructor;
- import lombok.extern.slf4j.Slf4j;
- import org.apache.commons.lang3.StringUtils;
- import org.apache.http.HttpResponse;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.impl.client.CloseableHttpClient;
- import org.apache.http.impl.client.HttpClients;
- import org.apache.http.util.EntityUtils;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- import org.springframework.beans.factory.annotation.Autowired;
- import org.springframework.scheduling.annotation.Scheduled;
- import org.springframework.stereotype.Service;
- import org.springframework.transaction.annotation.Propagation;
- import org.springframework.transaction.annotation.Transactional;
- import org.springframework.util.CollectionUtils;
- import java.io.IOException;
- import java.text.SimpleDateFormat;
- import java.time.LocalDate;
- import java.time.format.DateTimeFormatter;
- import java.util.*;
- @Slf4j
- @Service
- @RequiredArgsConstructor
- public class GetEcigaretteService {
- private final ArticleInfoService articleInfoService;
- @Autowired
- private DifyService difyService;
- @Autowired
- private SourceInfoMapper sourceInfoMapper;
- // @Scheduled(cron = "0 0 5 * * ?")
- // @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Throwable.class)
- public void execute() {
- System.out.println(new Date() + "-------AddArticle-Begin");
- List<SourceInfo> sourceInfos = sourceInfoMapper.selectList(new LambdaQueryWrapper<SourceInfo>()
- .in(SourceInfo::getCategoryId, Arrays.asList(16, 17, 18, 19, 20, 21, 22)));
- for (SourceInfo sourceInfo : sourceInfos) {
- String sourceUrl = sourceInfo.getSourceUrl();
- String sourceName = sourceInfo.getSourceName();
- Integer categoryId = sourceInfo.getCategoryId();
- Integer sourceInfoId = sourceInfo.getId();
- this.crawlEcigaretteArticles(sourceUrl, categoryId, sourceInfoId, sourceName);
- }
- System.out.println(new Date() + "-------AddArticle-End");
- }
- public void crawlEcigaretteArticles(String baseUrl, Integer type, Integer sourceId, String sourceName) {
- List<GetArticleInfoDTO> articleInfoDTOS = null;
- try {
- // 使用Jsoup连接并解析网页
- Document doc = Jsoup.connect(baseUrl)
- .timeout(30000) // 增加超时时间
- .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36")
- .followRedirects(true) // 跟随重定向
- .get();
- Elements elements = doc.select("div.list-box");
- Elements newsItems = elements.select("a[href]");
- articleInfoDTOS = new ArrayList<>();
- int sum = 0;
- for (Element item : newsItems) {
- if (sum > 10) {
- break;
- }
- //获取时间
- Elements timeElement = item.select("span.time");
- String dateStr = timeElement.text().trim();
- String yesterdayDateStr = DateUtil.getYesterdayDateStr();
- if (!StringUtils.equals(dateStr, "1天前")) {
- break;
- }
- Date date = new Date();
- SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
- try {
- date = dateFormat.parse(yesterdayDateStr);
- } catch (Exception e) {
- continue;
- }
- Element linkElement = item.selectFirst("a[href]");
- String link = linkElement.absUrl("href");
- Elements titleElement = item.select("div.title");
- String title = titleElement.text().trim();
- // 使用
- if (shouldSkipTobaccoTitle(sourceName, title)) {
- continue;
- }
- // 检查文章是否已存在
- long count = articleInfoService.count(new LambdaQueryWrapper<ArticleInfo>()
- .eq(ArticleInfo::getTitle, title)
- .or()
- .eq(ArticleInfo::getArticleUrl, link));
- if (count > 0) {
- continue; // 文章已存在,跳过
- }
- Elements descElement = item.select("div.desc");
- String desc = descElement.text().trim();
- Map<String, String> map = null;
- try {
- map = this.getDigest(link);
- } catch (IOException e) {
- continue;
- }
- String aiContent = map.get("aiContent");
- String content = map.get("content");
- if (StringUtils.isEmpty(aiContent)) {
- continue;
- }
- String condensedAbstract = null;
- try {
- condensedAbstract = difyService.getEcigaretteSummary(aiContent);
- } catch (Exception ignored) {
- }
- if (StringUtils.isEmpty(condensedAbstract)) {
- continue;
- }
- GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
- articleInfoDTO.setCategoryId(type);
- articleInfoDTO.setSourceId(sourceId);
- articleInfoDTO.setArticleUrl(link);
- articleInfoDTO.setTitle(title);
- articleInfoDTO.setPublicTime(date);
- articleInfoDTO.setDigest(condensedAbstract);
- articleInfoDTO.setOldContent(desc + "\n" + content);
- articleInfoDTOS.add(articleInfoDTO);
- sum++;
- }
- articleInfoService.batchAddArticleInfo(articleInfoDTOS);
- } catch (Exception e) {
- System.out.println("Read Json Error" + e);
- }
- }
- /**
- * 检查是否需要跳过当前标题(适用于"两个至上-诉讼"来源)
- */
- private static boolean shouldSkipTobaccoTitle(String sourceName, String title) {
- if (!StringUtils.equals(sourceName, "两个至上-诉讼")) {
- return false;
- }
- if (StringUtils.isEmpty(title) || !title.contains("烟")) {
- return false;
- }
- // 排除关键词
- String[] excludedKeywords = {"烟店", "零售商", "囚犯","未成年","嫌疑人"};
- for (String keyword : excludedKeywords) {
- if (title.contains(keyword)) {
- return true;
- }
- }
- return false; // 满足所有条件,不跳过
- }
- public Map<String, String> getDigest(String url) throws IOException {
- Map<String, String> map = null;
- CloseableHttpClient httpClient = HttpClients.createDefault();
- HttpGet request = new HttpGet(url);
- HttpResponse response = httpClient.execute(request);
- String responseBody = EntityUtils.toString(response.getEntity());
- httpClient.close();
- if (responseBody != null) {
- map = this.readJson(responseBody);
- }
- return map;
- }
- public Map<String, String> readJson(String responseBody) {
- Map<String, String> map = new HashMap<>();
- String content = "";
- try {
- //使用JSoup解析HTML
- Document doc = Jsoup.parse(responseBody);
- // 获取description元数据
- Element element = doc.selectFirst("div.ckeditor");
- String text1 = element.text();
- // 获取h2标签的文本
- List<String> list = new ArrayList<>();
- Element element1 = element.selectFirst("ul, ol");
- if (element1 != null) {
- list.add("核心看点:");
- Elements liElements = element1.select("li");
- int i = 1;
- for (Element ment : liElements) {
- String text = ment.text();
- if (StringUtils.isNotEmpty(text)) {
- list.add("(" + i + ")" + text);
- i++;
- }
- }
- if (list.size() == 1) {
- String articleContent = element.text();
- list.add(articleContent);
- content = StringUtils.join(list, "\n");
- } else {
- content = StringUtils.join(list, "\n");
- }
- } else {
- String articleContent = element.text();
- list.add(articleContent);
- content = StringUtils.join(list, "\n");
- }
- map.put("aiContent", text1);
- map.put("content", content);
- } catch (Exception e) {
- content = "";
- map.put("content", content);
- }
- return map;
- }
- public void crawlEcigaretteArticles1(String baseUrl, Integer type, Integer sourceId, String sourceName) {
- List<GetArticleInfoDTO> articleInfoDTOS = null;
- try {
- // 使用Jsoup连接并解析网页
- Document doc = Jsoup.connect(baseUrl)
- .timeout(30000) // 增加超时时间
- .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36")
- .followRedirects(true) // 跟随重定向
- .get();
- Elements elements = doc.select("div.list-box");
- Elements newsItems = elements.select("a[href]");
- articleInfoDTOS = new ArrayList<>();
- int sum = 0;
- for (Element item : newsItems) {
- //获取时间
- Elements timeElement = item.select("span.time");
- String dateStr = timeElement.text().trim();
- String yesterdayDateStr = null;
- if (dateStr.contains("小时")) {
- yesterdayDateStr = LocalDate.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd"));
- } else if (StringUtils.equals(dateStr, "1天前")) {
- yesterdayDateStr = DateUtil.getYesterdayDateStr();
- } else if (StringUtils.equals(dateStr, "2天前")) {
- yesterdayDateStr = LocalDate.now().minusDays(2).format(DateTimeFormatter.ofPattern("yyyy-MM-dd"));
- } else {
- yesterdayDateStr = LocalDate.now().minusDays(3).format(DateTimeFormatter.ofPattern("yyyy")) + "-" + dateStr;
- }
- Date date = new Date();
- SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
- try {
- date = dateFormat.parse(yesterdayDateStr);
- } catch (Exception e) {
- System.out.println("time error" + yesterdayDateStr);
- continue;
- }
- Element linkElement = item.selectFirst("a[href]");
- String link = linkElement.absUrl("href");
- Elements titleElement = item.select("div.title");
- String title = titleElement.text().trim();
- // 使用
- if (shouldSkipTobaccoTitle(sourceName, title)) {
- continue;
- }
- // 检查文章是否已存在
- long count = articleInfoService.count(new LambdaQueryWrapper<ArticleInfo>()
- .eq(ArticleInfo::getTitle, title)
- .or()
- .eq(ArticleInfo::getArticleUrl, link));
- if (count > 0) {
- continue; // 文章已存在,跳过
- }
- Elements descElement = item.select("div.desc");
- String desc = descElement.text().trim();
- Map<String, String> map = null;
- try {
- map = this.getDigest(link);
- } catch (IOException e) {
- continue;
- }
- String aiContent = map.get("aiContent");
- String content = map.get("content");
- if (StringUtils.isEmpty(aiContent)) {
- continue;
- }
- String condensedAbstract = null;
- try {
- condensedAbstract = difyService.getEcigaretteSummary(aiContent);
- } catch (Exception ignored) {
- }
- if (StringUtils.isEmpty(condensedAbstract)) {
- continue;
- }
- GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
- articleInfoDTO.setCategoryId(type);
- articleInfoDTO.setSourceId(sourceId);
- articleInfoDTO.setArticleUrl(link);
- articleInfoDTO.setTitle(title);
- articleInfoDTO.setPublicTime(date);
- articleInfoDTO.setDigest(condensedAbstract);
- articleInfoDTO.setOldContent(desc + "\n" + content);
- articleInfoDTOS.add(articleInfoDTO);
- sum++;
- }
- articleInfoService.batchAddArticleInfo(articleInfoDTOS);
- } catch (Exception e) {
- System.out.println("Read Json Error" + e);
- }
- }
- }
|