|
@@ -1,10 +1,13 @@
|
|
package com.cslg.ppa.service.GetWebArticle;
|
|
package com.cslg.ppa.service.GetWebArticle;
|
|
|
|
|
|
|
|
+import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
|
|
import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
|
|
import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
|
|
import com.cslg.ppa.common.utils.DateUtil;
|
|
import com.cslg.ppa.common.utils.DateUtil;
|
|
import com.cslg.ppa.common.utils.RegexUtil;
|
|
import com.cslg.ppa.common.utils.RegexUtil;
|
|
import com.cslg.ppa.dto.GetArticleInfoDTO;
|
|
import com.cslg.ppa.dto.GetArticleInfoDTO;
|
|
import com.cslg.ppa.entity.ArticleInfo;
|
|
import com.cslg.ppa.entity.ArticleInfo;
|
|
|
|
+import com.cslg.ppa.entity.SourceInfo;
|
|
|
|
+import com.cslg.ppa.mapper.SourceInfoMapper;
|
|
import com.cslg.ppa.service.ArticleInfoService;
|
|
import com.cslg.ppa.service.ArticleInfoService;
|
|
import com.cslg.ppa.service.commom.DifyService;
|
|
import com.cslg.ppa.service.commom.DifyService;
|
|
import lombok.RequiredArgsConstructor;
|
|
import lombok.RequiredArgsConstructor;
|
|
@@ -16,6 +19,7 @@ import org.jsoup.nodes.Document;
|
|
import org.jsoup.nodes.Element;
|
|
import org.jsoup.nodes.Element;
|
|
import org.jsoup.select.Elements;
|
|
import org.jsoup.select.Elements;
|
|
import org.springframework.beans.factory.annotation.Autowired;
|
|
import org.springframework.beans.factory.annotation.Autowired;
|
|
|
|
+import org.springframework.scheduling.annotation.Scheduled;
|
|
import org.springframework.stereotype.Service;
|
|
import org.springframework.stereotype.Service;
|
|
import org.springframework.transaction.annotation.Propagation;
|
|
import org.springframework.transaction.annotation.Propagation;
|
|
import org.springframework.transaction.annotation.Transactional;
|
|
import org.springframework.transaction.annotation.Transactional;
|
|
@@ -26,6 +30,7 @@ import java.text.SimpleDateFormat;
|
|
import java.util.ArrayList;
|
|
import java.util.ArrayList;
|
|
import java.util.Date;
|
|
import java.util.Date;
|
|
import java.util.List;
|
|
import java.util.List;
|
|
|
|
+import java.util.stream.Collectors;
|
|
|
|
|
|
@Slf4j
|
|
@Slf4j
|
|
@Service
|
|
@Service
|
|
@@ -34,16 +39,72 @@ public class GetCNIPRArticleService {
|
|
private final ArticleInfoService articleInfoService;
|
|
private final ArticleInfoService articleInfoService;
|
|
@Autowired
|
|
@Autowired
|
|
private DifyService difyService;
|
|
private DifyService difyService;
|
|
|
|
+ @Autowired
|
|
|
|
+ private SourceInfoMapper sourceInfoMapper;
|
|
|
|
+
|
|
|
|
+ private static final String LINK_SELECTOR = "a[href]";
|
|
|
|
+ private static final String TITLE_SELECTOR = "a";
|
|
|
|
|
|
/***
|
|
/***
|
|
* 中国知识产权网
|
|
* 中国知识产权网
|
|
- * @param baseUrl
|
|
|
|
* @throws IOException
|
|
* @throws IOException
|
|
*/
|
|
*/
|
|
- // @Scheduled(cron = "0 0 2 * * ?")
|
|
|
|
- @Transactional(propagation = Propagation.REQUIRED,rollbackFor = Throwable.class)
|
|
|
|
- public void crawCniprArticle(String baseUrl) throws IOException {
|
|
|
|
|
|
+ @Scheduled(cron = "0 0 4 * * ?")
|
|
|
|
+ @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Throwable.class)
|
|
|
|
+ public void crawCniprArticle() throws IOException {
|
|
System.out.println(new Date() + "CNIPR-Begin");
|
|
System.out.println(new Date() + "CNIPR-Begin");
|
|
|
|
+ SourceInfo sourceInfo = sourceInfoMapper.selectOne(new LambdaQueryWrapper<SourceInfo>()
|
|
|
|
+ .eq(SourceInfo::getWebType, 3));
|
|
|
|
+ if (ObjectUtils.isNotEmpty(sourceInfo)) {
|
|
|
|
+ String baseUrl = sourceInfo.getSourceUrl();
|
|
|
|
+ Integer sourceInfoId = sourceInfo.getId();
|
|
|
|
+ List<String> list = new ArrayList<>();
|
|
|
|
+ try {
|
|
|
|
+ Thread.sleep(1000);
|
|
|
|
+ // 使用Jsoup连接并解析网页
|
|
|
|
+ Document doc = Jsoup.connect(baseUrl)
|
|
|
|
+ .timeout(20000) // 增加超时时间
|
|
|
|
+ .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36")
|
|
|
|
+ .followRedirects(true) // 跟随重定向
|
|
|
|
+ .get();
|
|
|
|
+
|
|
|
|
+ // 改进的新闻列表抓取逻辑
|
|
|
|
+ Elements newsItems = doc.select(LINK_SELECTOR);
|
|
|
|
+ for (Element item : newsItems) {
|
|
|
|
+ try {
|
|
|
|
+ Element titleElement = item.selectFirst(TITLE_SELECTOR);
|
|
|
|
+ Element linkElement = item.selectFirst(LINK_SELECTOR);
|
|
|
|
+
|
|
|
|
+ if (titleElement != null && linkElement != null) {
|
|
|
|
+ String title = titleElement.text().trim();
|
|
|
|
+ if (StringUtils.equals("通知公告", title) || StringUtils.equals("新闻资讯", title)) {
|
|
|
|
+ String link = linkElement.absUrl("href"); // 获取绝对URL
|
|
|
|
+ list.add(link);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ } catch (Exception e) {
|
|
|
|
+ log.warn("解析单个新闻项时出错: ", e);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ } catch (Exception ignored) {
|
|
|
|
+
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (!CollectionUtils.isEmpty(list)) {
|
|
|
|
+ List<String> collect = list.stream().distinct().collect(Collectors.toList());
|
|
|
|
+ for (String url : collect) {
|
|
|
|
+ try {
|
|
|
|
+ this.crawCniprArticleDetail(url, sourceInfoId, 6);
|
|
|
|
+ } catch (IOException ignored) {
|
|
|
|
+
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ System.out.println(new Date() + "CNIPR-End");
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ public void crawCniprArticleDetail(String baseUrl,Integer sourceId, Integer type) throws IOException {
|
|
// 使用Jsoup连接并解析网页
|
|
// 使用Jsoup连接并解析网页
|
|
Document doc = Jsoup.connect(baseUrl)
|
|
Document doc = Jsoup.connect(baseUrl)
|
|
.timeout(20000) // 增加超时时间
|
|
.timeout(20000) // 增加超时时间
|
|
@@ -57,6 +118,9 @@ public class GetCNIPRArticleService {
|
|
List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
|
|
List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
|
|
for (Element linkElement : linkElements) {
|
|
for (Element linkElement : linkElements) {
|
|
Element element = linkElement.getElementsByTag("a").first();
|
|
Element element = linkElement.getElementsByTag("a").first();
|
|
|
|
+ if (element == null) {
|
|
|
|
+ continue;
|
|
|
|
+ }
|
|
String link = element.absUrl("href");
|
|
String link = element.absUrl("href");
|
|
// 如果absUrl没有返回有效链接,尝试其他方式
|
|
// 如果absUrl没有返回有效链接,尝试其他方式
|
|
if (StringUtils.isEmpty(link)) {
|
|
if (StringUtils.isEmpty(link)) {
|
|
@@ -75,7 +139,7 @@ public class GetCNIPRArticleService {
|
|
}
|
|
}
|
|
|
|
|
|
try {
|
|
try {
|
|
- GetArticleInfoDTO articleInfoDTO = this.addCniprArticle(link);
|
|
|
|
|
|
+ GetArticleInfoDTO articleInfoDTO = this.addCniprArticle(link, sourceId, type);
|
|
if (ObjectUtils.isNotEmpty(articleInfoDTO)) {
|
|
if (ObjectUtils.isNotEmpty(articleInfoDTO)) {
|
|
articleInfoDTOS.add(articleInfoDTO);
|
|
articleInfoDTOS.add(articleInfoDTO);
|
|
}
|
|
}
|
|
@@ -83,10 +147,9 @@ public class GetCNIPRArticleService {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
articleInfoService.batchAddArticleInfo(articleInfoDTOS);
|
|
articleInfoService.batchAddArticleInfo(articleInfoDTOS);
|
|
- System.out.println(new Date() + "CNIPR-Begin");
|
|
|
|
}
|
|
}
|
|
|
|
|
|
- public GetArticleInfoDTO addCniprArticle(String baseUrl) throws IOException {
|
|
|
|
|
|
+ public GetArticleInfoDTO addCniprArticle(String baseUrl, Integer sourceId, Integer type) throws IOException {
|
|
// 使用Jsoup连接并解析网页
|
|
// 使用Jsoup连接并解析网页
|
|
Document doc = Jsoup.connect(baseUrl)
|
|
Document doc = Jsoup.connect(baseUrl)
|
|
.timeout(15000) // 增加超时时间
|
|
.timeout(15000) // 增加超时时间
|
|
@@ -113,8 +176,9 @@ public class GetCNIPRArticleService {
|
|
Element titleElement = initElement.selectFirst("div.xq_cont_title");
|
|
Element titleElement = initElement.selectFirst("div.xq_cont_title");
|
|
String title = titleElement.select("p").first().text();
|
|
String title = titleElement.select("p").first().text();
|
|
// 检查文章是否已存在
|
|
// 检查文章是否已存在
|
|
- ArticleInfo articleInfo = articleInfoService.getOne(new QueryWrapper<ArticleInfo>().lambda().eq(ArticleInfo::getTitle, title));
|
|
|
|
- if (ObjectUtils.isNotEmpty(articleInfo)) {
|
|
|
|
|
|
+ long count = articleInfoService.count(new LambdaQueryWrapper<ArticleInfo>()
|
|
|
|
+ .eq(ArticleInfo::getTitle, title));
|
|
|
|
+ if (count > 0) {
|
|
return null;
|
|
return null;
|
|
}
|
|
}
|
|
|
|
|
|
@@ -142,8 +206,8 @@ public class GetCNIPRArticleService {
|
|
condensedAbstract = "";
|
|
condensedAbstract = "";
|
|
}
|
|
}
|
|
GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
|
|
GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
|
|
- articleInfoDTO.setCategoryId(6);
|
|
|
|
- articleInfoDTO.setSourceId(38);
|
|
|
|
|
|
+ articleInfoDTO.setCategoryId(type);
|
|
|
|
+ articleInfoDTO.setSourceId(sourceId);
|
|
articleInfoDTO.setArticleUrl(baseUrl);
|
|
articleInfoDTO.setArticleUrl(baseUrl);
|
|
articleInfoDTO.setTitle(title);
|
|
articleInfoDTO.setTitle(title);
|
|
articleInfoDTO.setPublicTime(date);
|
|
articleInfoDTO.setPublicTime(date);
|