|
@@ -1,31 +1,48 @@
|
|
|
package com.cslg.ppa.service.GetWebArticle;
|
|
|
|
|
|
+import com.alibaba.fastjson.JSONArray;
|
|
|
+import com.alibaba.fastjson.JSONObject;
|
|
|
+import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
|
|
|
import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
|
|
|
import com.cslg.ppa.common.utils.DateUtil;
|
|
|
+import com.cslg.ppa.common.utils.RegexUtil;
|
|
|
import com.cslg.ppa.dto.GetArticleInfoDTO;
|
|
|
+import com.cslg.ppa.dto.HubeiArticleDTO;
|
|
|
import com.cslg.ppa.entity.ArticleInfo;
|
|
|
+import com.cslg.ppa.entity.SourceInfo;
|
|
|
+import com.cslg.ppa.mapper.SourceInfoMapper;
|
|
|
+import com.cslg.ppa.service.ArticleInfoService;
|
|
|
import com.cslg.ppa.service.commom.DifyService;
|
|
|
import com.cslg.ppa.service.commom.XmlParseService;
|
|
|
+import com.google.gson.Gson;
|
|
|
import lombok.RequiredArgsConstructor;
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
+import okhttp3.MediaType;
|
|
|
+import okhttp3.OkHttpClient;
|
|
|
+import okhttp3.Request;
|
|
|
+import okhttp3.RequestBody;
|
|
|
import org.apache.commons.lang3.ObjectUtils;
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
+import org.apache.http.HttpResponse;
|
|
|
+import org.apache.http.client.methods.HttpGet;
|
|
|
+import org.apache.http.impl.client.CloseableHttpClient;
|
|
|
+import org.apache.http.impl.client.HttpClients;
|
|
|
+import org.apache.http.util.EntityUtils;
|
|
|
+import org.apache.poi.hssf.record.DVALRecord;
|
|
|
import org.jsoup.Jsoup;
|
|
|
import org.jsoup.nodes.Document;
|
|
|
import org.jsoup.nodes.Element;
|
|
|
-import org.jsoup.nodes.Node;
|
|
|
import org.jsoup.select.Elements;
|
|
|
import org.springframework.beans.factory.annotation.Autowired;
|
|
|
import org.springframework.stereotype.Service;
|
|
|
+import org.springframework.transaction.annotation.Propagation;
|
|
|
+import org.springframework.transaction.annotation.Transactional;
|
|
|
import org.springframework.util.CollectionUtils;
|
|
|
|
|
|
import java.io.IOException;
|
|
|
-import java.text.ParseException;
|
|
|
+import java.nio.charset.StandardCharsets;
|
|
|
import java.text.SimpleDateFormat;
|
|
|
-import java.util.ArrayList;
|
|
|
-import java.util.Arrays;
|
|
|
-import java.util.Date;
|
|
|
-import java.util.List;
|
|
|
+import java.util.*;
|
|
|
import java.util.regex.Matcher;
|
|
|
import java.util.regex.Pattern;
|
|
|
import java.util.stream.Collectors;
|
|
@@ -34,28 +51,94 @@ import java.util.stream.Collectors;
|
|
|
@Service
|
|
|
@RequiredArgsConstructor
|
|
|
public class GetProvinceNewsService {
|
|
|
+ private final ArticleInfoService articleInfoService;
|
|
|
private final XmlParseService xmlParseService;
|
|
|
@Autowired
|
|
|
private DifyService difyService;
|
|
|
+ @Autowired
|
|
|
+ private SourceInfoMapper sourceInfoMapper;
|
|
|
+
|
|
|
|
|
|
- private static List<String> TitleList = Arrays.asList("要闻动态", "通知公告");
|
|
|
- private static final Pattern DATE_PATTERN = Pattern.compile("(?<!\\d)(\\d{4}-\\d{2}-\\d{2})(?!\\d)");
|
|
|
+ private static List<Integer> SourceList = Arrays.asList(1);
|
|
|
+ private static List<String> TitleList = Arrays.asList("要闻动态", "通知公告", "公示公告", "公告");
|
|
|
// 新闻动态项的CSS选择器 (通用配置,适用于大部分政府网站)
|
|
|
private static final String NEWS_CONTAINER_SELECTOR = "div.con-right-list,div.main-content-right,div.tab-content,div.subpageCon-con,div.gl-main,div.nymain," +
|
|
|
- "div.contain";
|
|
|
+ "div.contain,div.newsList,div.mainContent,div.gly,div.lists,div.mod_page_box,div.t_con,div.yearReport-con,div.innovate,ul.news-list," +
|
|
|
+ "div.top_right_con,div.listright,div.gl,div.list_container,div.zkmm_right,#subNavgsgg,div.content_right,div.r_box,div.m_t_12,div.zw-con,div.list-fr";
|
|
|
private static final String TITLE_SELECTOR = "a";
|
|
|
private static final String LINK_SELECTOR = "a[href]";
|
|
|
|
|
|
- public void crawlArticles(String baseUrl) throws IOException {
|
|
|
+ /**
|
|
|
+ * 抓取国内各大省份地区相关知识产权局资讯
|
|
|
+ *
|
|
|
+ * @throws Exception
|
|
|
+ */
|
|
|
+// @Scheduled(cron = "0 0 2 * * ?")
|
|
|
+ @Transactional(propagation = Propagation.REQUIRED,rollbackFor = Throwable.class)
|
|
|
+ public void addArticleFromWebSource() throws Exception {
|
|
|
+ System.out.println(new Date() + "Province-Begin");
|
|
|
+ List<SourceInfo> sourceInfos = sourceInfoMapper.selectList(new LambdaQueryWrapper<SourceInfo>()
|
|
|
+ .eq(SourceInfo::getSourceType, 1)
|
|
|
+ .eq(SourceInfo::getWebType, 2));
|
|
|
+ if (!CollectionUtils.isEmpty(sourceInfos)) {
|
|
|
+ for (SourceInfo sourceInfo : sourceInfos) {
|
|
|
+ try {
|
|
|
+ Thread.sleep(1000);
|
|
|
+ } catch (Exception ignored) {
|
|
|
+
|
|
|
+ }
|
|
|
+ Integer id = sourceInfo.getId();
|
|
|
+ String sourceName = sourceInfo.getSourceName();
|
|
|
+ String sourceUrl = sourceInfo.getSourceUrl();
|
|
|
+ if (StringUtils.equals("河北省市场监督管理局(知识产权局)", sourceName)) {
|
|
|
+ try {
|
|
|
+ this.crawlHebeiArticles(sourceUrl, sourceName, id, 2);
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.warn("解析" + sourceName + "地区新闻项时出错: ", e);
|
|
|
+ }
|
|
|
+ } else if (StringUtils.equals("江西省市场监督管理局(知识产权局)", sourceName)) {
|
|
|
+ try {
|
|
|
+ this.crawlJiangxiArticles(sourceUrl, sourceName, id, 2);
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.warn("解析" + sourceName + "地区新闻项时出错: ", e);
|
|
|
+ }
|
|
|
+ } else if (StringUtils.equals("湖北省知识产权局", sourceName)) {
|
|
|
+ try {
|
|
|
+ this.crawlHubeiArticles(sourceUrl, sourceName, id, 2);
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.warn("解析" + sourceName + "地区新闻项时出错: ", e);
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ try {
|
|
|
+ this.crawlArticles(sourceUrl, sourceName, id, 2);
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.warn("解析" + sourceName + "地区新闻项时出错: ", e);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ System.out.println(new Date() + "Province-Begin");
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 通用性抓取各大省份知识产权局资讯
|
|
|
+ *
|
|
|
+ * @param baseUrl
|
|
|
+ * @param sourceName
|
|
|
+ * @param sourceId
|
|
|
+ * @param type
|
|
|
+ * @throws Exception
|
|
|
+ */
|
|
|
+ public void crawlArticles(String baseUrl, String sourceName, Integer sourceId, Integer type) throws Exception {
|
|
|
// 使用Jsoup连接并解析网页
|
|
|
Document doc = Jsoup.connect(baseUrl)
|
|
|
- .timeout(15000) // 增加超时时间
|
|
|
+ .timeout(20000) // 增加超时时间
|
|
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36")
|
|
|
.followRedirects(true) // 跟随重定向
|
|
|
.get();
|
|
|
|
|
|
// 改进的新闻列表抓取逻辑
|
|
|
- Elements newsItems = doc.select(LINK_SELECTOR);
|
|
|
+ Elements newsItems = doc.select(LINK_SELECTOR);
|
|
|
List<String> list = new ArrayList<>();
|
|
|
for (Element item : newsItems) {
|
|
|
try {
|
|
@@ -64,9 +147,14 @@ public class GetProvinceNewsService {
|
|
|
|
|
|
if (titleElement != null && linkElement != null) {
|
|
|
String title = titleElement.text().trim();
|
|
|
- if (TitleList.contains(title)) {
|
|
|
+ if (baseUrl.contains("amr.hainan.gov.cn") && StringUtils.equals("通知公告", title)) {
|
|
|
String link = linkElement.absUrl("href"); // 获取绝对URL
|
|
|
list.add(link);
|
|
|
+ } else {
|
|
|
+ if (TitleList.contains(title) && !baseUrl.contains("amr.hainan.gov.cn")) {
|
|
|
+ String link = linkElement.absUrl("href"); // 获取绝对URL
|
|
|
+ list.add(link);
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
} catch (Exception e) {
|
|
@@ -76,13 +164,26 @@ public class GetProvinceNewsService {
|
|
|
if (!CollectionUtils.isEmpty(list)) {
|
|
|
List<String> collect = list.stream().distinct().collect(Collectors.toList());
|
|
|
for (String url : collect) {
|
|
|
- this.crawlArticlesDetail(url);
|
|
|
+ this.crawlArticlesDetail(url, sourceName, sourceId, type);
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- public List<GetArticleInfoDTO> crawlArticlesDetail(String baseUrl) throws IOException {
|
|
|
- List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
|
|
|
+ /**
|
|
|
+ * 抓取具体某个知识产权局过程详情
|
|
|
+ *
|
|
|
+ * @param baseUrl
|
|
|
+ * @param sourceName
|
|
|
+ * @param sourceId
|
|
|
+ * @param type
|
|
|
+ * @throws Exception
|
|
|
+ */
|
|
|
+ public void crawlArticlesDetail(String baseUrl, String sourceName, Integer sourceId, Integer type) throws Exception {
|
|
|
+ try {
|
|
|
+ Thread.sleep(1000);
|
|
|
+ } catch (Exception ignored) {
|
|
|
+
|
|
|
+ }
|
|
|
// 使用Jsoup连接并解析网页
|
|
|
Document doc = Jsoup.connect(baseUrl)
|
|
|
.timeout(15000) // 增加超时时间
|
|
@@ -92,11 +193,13 @@ public class GetProvinceNewsService {
|
|
|
|
|
|
// 改进的新闻列表抓取逻辑
|
|
|
Element container = doc.selectFirst(NEWS_CONTAINER_SELECTOR);
|
|
|
+ if (container == null && baseUrl.contains("www.ahippc.cn")) {
|
|
|
+ container = doc.selectFirst("div.right");
|
|
|
+ }
|
|
|
Element scriptElement = container.select("script").first();
|
|
|
List<String> reStrs = new ArrayList<>();
|
|
|
if (scriptElement == null) {
|
|
|
Elements elements = container.select("li");
|
|
|
-// List<String> collect = elements.stream().map(Node::outerHtml).collect(Collectors.toList());
|
|
|
for (Element element : elements) {
|
|
|
String liTag = element.outerHtml().trim();
|
|
|
reStrs.add(liTag);
|
|
@@ -112,31 +215,57 @@ public class GetProvinceNewsService {
|
|
|
}
|
|
|
}
|
|
|
if (CollectionUtils.isEmpty(reStrs)) {
|
|
|
- Elements elements = container.select("li");
|
|
|
-// List<String> collect = elements.stream().map(Node::outerHtml).collect(Collectors.toList());
|
|
|
+ Elements elements = null;
|
|
|
+ if (baseUrl.contains("www.ahippc.cn")) {
|
|
|
+ elements = container.select("dd.fix");
|
|
|
+ } else {
|
|
|
+ elements = container.select("li");
|
|
|
+ }
|
|
|
for (Element element : elements) {
|
|
|
String liTag = element.outerHtml().trim();
|
|
|
reStrs.add(liTag);
|
|
|
}
|
|
|
}
|
|
|
- int count = 1;
|
|
|
+
|
|
|
+ commonProvinceAddArticle(reStrs, baseUrl, sourceName, sourceId, type);
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 将具体知识产权局相关资讯信息添加到资讯表中
|
|
|
+ *
|
|
|
+ * @param reStrs
|
|
|
+ * @param baseUrl
|
|
|
+ * @param sourceName
|
|
|
+ * @param sourceId
|
|
|
+ * @param type
|
|
|
+ */
|
|
|
+ public void commonProvinceAddArticle(List<String> reStrs, String baseUrl, String sourceName, Integer sourceId, Integer type) {
|
|
|
+ List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
|
|
|
+ int count = 0;
|
|
|
for (String reStr : reStrs) {
|
|
|
// 限制处理的新闻项数量,避免处理过多数据
|
|
|
- if (count >= 10) {
|
|
|
- log.info("已达到处理上限(30条),停止处理更多新闻项");
|
|
|
+ if (count > 20) {
|
|
|
+ log.info("已达到处理上限(20条),停止处理更多新闻项");
|
|
|
break;
|
|
|
}
|
|
|
Document document = Jsoup.parse(reStr);
|
|
|
Element linkElement = document.getElementsByTag("a").first();
|
|
|
+ if (baseUrl.contains("scjg.jl.gov.cn")) {
|
|
|
+ Elements linkElements = document.getElementsByTag("a");
|
|
|
+ linkElement = linkElements.get(1);
|
|
|
+ }
|
|
|
//获取时间
|
|
|
String dateStr = document.getElementsByTag("span").text().trim();
|
|
|
- if (baseUrl.contains("zjippc.org.cn")) {
|
|
|
- dateStr = extractDate(dateStr);
|
|
|
+ if (baseUrl.contains("zjippc.org.cn") || baseUrl.contains("www.sxippc.com")) {
|
|
|
+ dateStr = RegexUtil.extractDate(dateStr);
|
|
|
+ } else if (baseUrl.contains("amr.hainan.gov.cn") || baseUrl.contains("scjg.jl.gov.cn")) {
|
|
|
+ dateStr = document.getElementsByTag("em").text().trim();
|
|
|
+ dateStr = getHainanDate(dateStr);
|
|
|
}
|
|
|
String yesterdayDateStr = DateUtil.getYesterdayDateStr();
|
|
|
-// if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
|
|
|
-// continue;
|
|
|
-// }
|
|
|
+ if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
|
|
|
+ break;
|
|
|
+ }
|
|
|
Date date = new Date();
|
|
|
SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
|
|
|
try {
|
|
@@ -146,14 +275,21 @@ public class GetProvinceNewsService {
|
|
|
}
|
|
|
|
|
|
String title = linkElement.text();
|
|
|
- if (baseUrl.contains("zjippc.org.cn")) {
|
|
|
- title = title.replace(dateStr, "");
|
|
|
+ if (baseUrl.contains("zjippc.org.cn") || baseUrl.contains("www.sxippc.com") || baseUrl.contains("hlipa.hlj.gov.cn") || baseUrl.contains("scjg.nx.gov.cn")) {
|
|
|
+ title = title.replace(dateStr, "").trim();
|
|
|
+ } else if (baseUrl.contains("amr.yn.gov.cn") || baseUrl.contains("amr.nmg.gov.cn") || baseUrl.contains("scjgj.qinghai.gov.cn")
|
|
|
+ || baseUrl.contains("scjgj.sc.gov.cn") || baseUrl.contains("scjg.jl.gov.cn")
|
|
|
+ || baseUrl.contains("amr.guizhou.gov.cn") || baseUrl.contains("zscq.tj.gov.cn")) {
|
|
|
+ title = linkElement.attr("title").trim();
|
|
|
+ }
|
|
|
+ if (sourceName.contains("市场") && (!title.contains("知识产权") || !title.contains("知识"))) {
|
|
|
+ continue;
|
|
|
}
|
|
|
// 检查文章是否已存在
|
|
|
-// ArticleInfo articleInfo = articleInfoService.getOne(new QueryWrapper<ArticleInfo>().lambda().eq(ArticleInfo::getTitle, title));
|
|
|
-// if (ObjectUtils.isNotEmpty(articleInfo)) {
|
|
|
-// continue; // 文章已存在,跳过
|
|
|
-// }
|
|
|
+ ArticleInfo articleInfo = articleInfoService.getOne(new QueryWrapper<ArticleInfo>().lambda().eq(ArticleInfo::getTitle, title));
|
|
|
+ if (ObjectUtils.isNotEmpty(articleInfo)) {
|
|
|
+ continue; // 文章已存在,跳过
|
|
|
+ }
|
|
|
|
|
|
String link = linkElement.absUrl("href");
|
|
|
// 如果absUrl没有返回有效链接,尝试其他方式
|
|
@@ -176,38 +312,47 @@ public class GetProvinceNewsService {
|
|
|
try {
|
|
|
digest = this.getDigest(link);
|
|
|
} catch (Exception e) {
|
|
|
- System.out.println(e);
|
|
|
- System.out.println(link);
|
|
|
- System.out.println(title);
|
|
|
+ digest = "";
|
|
|
+
|
|
|
+ }
|
|
|
+ if (StringUtils.isEmpty(digest)) {
|
|
|
continue;
|
|
|
}
|
|
|
-// if (StringUtils.isEmpty(digest)) {
|
|
|
-// continue;
|
|
|
-// }
|
|
|
String condensedAbstract = null;
|
|
|
-// try {
|
|
|
-// condensedAbstract = difyService.getCondensedAbstract(digest);
|
|
|
-// } catch (Exception e) {
|
|
|
-//
|
|
|
-// }
|
|
|
-// if (StringUtils.isEmpty(condensedAbstract)) {
|
|
|
-// continue;
|
|
|
-// }
|
|
|
+ try {
|
|
|
+ condensedAbstract = difyService.getCondensedAbstract(digest);
|
|
|
+ } catch (Exception ignored) {
|
|
|
+
|
|
|
+ }
|
|
|
+ if (StringUtils.isEmpty(condensedAbstract)) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
|
|
|
-// articleInfoDTO.setCategoryId(type);
|
|
|
- articleInfoDTO.setSourceId(1);
|
|
|
+ articleInfoDTO.setCategoryId(type);
|
|
|
+ articleInfoDTO.setSourceId(sourceId);
|
|
|
articleInfoDTO.setArticleUrl(link);
|
|
|
articleInfoDTO.setTitle(title);
|
|
|
articleInfoDTO.setPublicTime(date);
|
|
|
- articleInfoDTO.setDigest(digest);
|
|
|
-// articleInfoDTOS.add(articleInfoDTO);
|
|
|
+ articleInfoDTO.setDigest(condensedAbstract);
|
|
|
+ articleInfoDTOS.add(articleInfoDTO);
|
|
|
count++;
|
|
|
}
|
|
|
-// articleInfoService.batchAddArticleInfo(articleInfoDTOS);
|
|
|
- return articleInfoDTOS;
|
|
|
+ articleInfoService.batchAddArticleInfo(articleInfoDTOS);
|
|
|
}
|
|
|
|
|
|
+ /**
|
|
|
+ * 获取资讯通知内容
|
|
|
+ *
|
|
|
+ * @param baseUrl
|
|
|
+ * @return
|
|
|
+ * @throws IOException
|
|
|
+ */
|
|
|
public String getDigest(String baseUrl) throws IOException {
|
|
|
+ try {
|
|
|
+ Thread.sleep(1000);
|
|
|
+ } catch (Exception ignored) {
|
|
|
+
|
|
|
+ }
|
|
|
// 使用Jsoup连接并解析网页
|
|
|
Document doc = Jsoup.connect(baseUrl)
|
|
|
.timeout(15000) // 增加超时时间
|
|
@@ -235,19 +380,329 @@ public class GetProvinceNewsService {
|
|
|
return content;
|
|
|
}
|
|
|
|
|
|
- public static String extractDate(String input) {
|
|
|
+ //获取浙江省知识产权保护中心资讯详情里的日期
|
|
|
+ public static String getHainanDate(String input) {
|
|
|
if (input == null || input.trim().isEmpty()) {
|
|
|
return null;
|
|
|
}
|
|
|
+ // 正则表达式匹配YYYY-MM-DD格式
|
|
|
+ Pattern pattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2}");
|
|
|
+ Matcher matcher = pattern.matcher(input);
|
|
|
+
|
|
|
+ String date = null;
|
|
|
+ if (matcher.find()) {
|
|
|
+ date = matcher.group();
|
|
|
+ }
|
|
|
+ return date;
|
|
|
+ }
|
|
|
+
|
|
|
+ private void setupHeaders(HttpGet request) {
|
|
|
+ request.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0");
|
|
|
+ request.addHeader("Cookie", "lcid=1043; __jsluid_s=d8da8b71aed1f47e6d74773d87ebf074; _va_ref=%5B%22%22%2C%22%22%2C1724059829%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DuiYh-tKeu5W26B6buTNuL3NeDZ5sZfdDhBXd4R344papXwOqiQ_DhapSKUUahDaN%26wd%3D%26eqid%3Dc0d9d9b6002526d400000004668cdae9%22%5D; _va_id=f7c6b7152dd01f89.1720512427.4.1724059829.1724059829.; _va_ses=*");
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 抓取湖北省知识产权局资讯
|
|
|
+ *
|
|
|
+ * @param baseUrl
|
|
|
+ * @param sourceName
|
|
|
+ * @param sourceId
|
|
|
+ * @param type
|
|
|
+ * @throws Exception
|
|
|
+ */
|
|
|
+ public void crawlHubeiArticles(String baseUrl, String sourceName, Integer sourceId, Integer type) throws Exception {
|
|
|
+ String url = baseUrl + "fbjd/zc/qtzdgkwj/qtgkwj.json";
|
|
|
+ List<String> responseBodies = new ArrayList<>();
|
|
|
+ CloseableHttpClient httpClient = HttpClients.createDefault();
|
|
|
+ try {
|
|
|
+ HttpGet request = new HttpGet(url);
|
|
|
+ setupHeaders(request);
|
|
|
+ HttpResponse response = httpClient.execute(request);
|
|
|
+ String responseBody = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
|
|
|
+ responseBodies.add(responseBody);
|
|
|
+ if (!responseBodies.isEmpty()) {
|
|
|
+ for (String res : responseBodies) {
|
|
|
+ JSONObject object = JSONObject.parseObject(res);
|
|
|
+ List<HubeiArticleDTO> list = JSONObject.parseArray(object.getString("data"), HubeiArticleDTO.class);
|
|
|
+ List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
|
|
|
+ int count = 0;
|
|
|
+ for (HubeiArticleDTO articleDTO : list) {
|
|
|
+ // 限制处理的新闻项数量,避免处理过多数据
|
|
|
+ if (count > 20) {
|
|
|
+ log.info("已达到处理上限(20条),停止处理更多新闻项");
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ String docreltime = articleDTO.getDOCRELTIME();
|
|
|
+ String dateStr = RegexUtil.extractDate(docreltime);
|
|
|
+ String yesterdayDateStr = DateUtil.getYesterdayDateStr();
|
|
|
+ if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ Date date = new Date();
|
|
|
+ SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
|
|
|
+ try {
|
|
|
+ date = dateFormat.parse(dateStr);
|
|
|
+ } catch (Exception e) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ String title = articleDTO.getFILENAME();
|
|
|
+ if (sourceName.contains("市场") && (!title.contains("知识产权") || !title.contains("知识"))) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ // 检查文章是否已存在
|
|
|
+ ArticleInfo articleInfo = articleInfoService.getOne(new QueryWrapper<ArticleInfo>().lambda().eq(ArticleInfo::getTitle, title));
|
|
|
+ if (ObjectUtils.isNotEmpty(articleInfo)) {
|
|
|
+ continue; // 文章已存在,跳过
|
|
|
+ }
|
|
|
+ String link = articleDTO.getURL();
|
|
|
+ String digest = null;
|
|
|
+ try {
|
|
|
+ digest = this.getDigest(link);
|
|
|
+ } catch (Exception e) {
|
|
|
+ digest = "";
|
|
|
+ }
|
|
|
+ if (StringUtils.isEmpty(digest)) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ String condensedAbstract = null;
|
|
|
+ try {
|
|
|
+ condensedAbstract = difyService.getCondensedAbstract(digest);
|
|
|
+ } catch (Exception ignored) {
|
|
|
+
|
|
|
+ }
|
|
|
+ if (StringUtils.isEmpty(condensedAbstract)) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
|
|
|
+ articleInfoDTO.setCategoryId(type);
|
|
|
+ articleInfoDTO.setSourceId(sourceId);
|
|
|
+ articleInfoDTO.setArticleUrl(link);
|
|
|
+ articleInfoDTO.setTitle(title);
|
|
|
+ articleInfoDTO.setPublicTime(date);
|
|
|
+ articleInfoDTO.setDigest(condensedAbstract);
|
|
|
+ articleInfoDTOS.add(articleInfoDTO);
|
|
|
+ count++;
|
|
|
+ }
|
|
|
+ articleInfoService.batchAddArticleInfo(articleInfoDTOS);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } catch (Exception ignored) {
|
|
|
+
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 抓取河北省知识产权局资讯
|
|
|
+ *
|
|
|
+ * @param baseUrl
|
|
|
+ * @param sourceName
|
|
|
+ * @param sourceId
|
|
|
+ * @param type
|
|
|
+ * @throws Exception
|
|
|
+ */
|
|
|
+ public void crawlHebeiArticles(String baseUrl, String sourceName, Integer sourceId, Integer type) throws Exception {
|
|
|
+ // 使用Jsoup连接并解析网页
|
|
|
+ Document doc = Jsoup.connect(baseUrl)
|
|
|
+ .timeout(20000) // 增加超时时间
|
|
|
+ .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36")
|
|
|
+ .followRedirects(true) // 跟随重定向
|
|
|
+ .get();
|
|
|
+
|
|
|
+ // 改进的新闻列表抓取逻辑
|
|
|
+ Elements newsItems = doc.select(LINK_SELECTOR);
|
|
|
+ List<String> list = new ArrayList<>();
|
|
|
+ for (Element item : newsItems) {
|
|
|
+ try {
|
|
|
+ String href = item.attr("href");
|
|
|
+ if (StringUtils.equals(href, "/node/919")) {
|
|
|
+ String s = item.absUrl("href");
|
|
|
+ list.add(s);
|
|
|
+ }
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.warn("解析单个新闻项时出错: ", e);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for (String url : list) {
|
|
|
+ addHebeiArticle(url, sourceName, sourceId, type);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 将河北知识产权局资讯添加到资讯表中
|
|
|
+ *
|
|
|
+ * @param baseUrl
|
|
|
+ * @param sourceName
|
|
|
+ * @param sourceId
|
|
|
+ * @param type
|
|
|
+ * @throws Exception
|
|
|
+ */
|
|
|
+ public void addHebeiArticle(String baseUrl, String sourceName, Integer sourceId, Integer type) throws Exception {
|
|
|
+ // 使用Jsoup连接并解析网页
|
|
|
+ Document doc = Jsoup.connect(baseUrl)
|
|
|
+ .timeout(15000) // 增加超时时间
|
|
|
+ .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36")
|
|
|
+ .followRedirects(true) // 跟随重定向
|
|
|
+ .get();
|
|
|
|
|
|
- Matcher matcher = DATE_PATTERN.matcher(input);
|
|
|
- String lastMatch = null;
|
|
|
+ // 改进的新闻列表抓取逻辑
|
|
|
+ Element container = doc.selectFirst(NEWS_CONTAINER_SELECTOR);
|
|
|
+ Elements linkElements = container.select("a.zkmmr_tl1_item_a");
|
|
|
+ int count = 0;
|
|
|
+ List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
|
|
|
+ for (Element linkElement : linkElements) {
|
|
|
+ // 限制处理的新闻项数量,避免处理过多数据
|
|
|
+ if (count > 20) {
|
|
|
+ log.info("已达到处理上限(20条),停止处理更多新闻项");
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ Element element = linkElement.select("p.zkmmr_tl1_item_date").first();
|
|
|
+ String dateStr = element.text().trim();
|
|
|
+ String yesterdayDateStr = DateUtil.getYesterdayDateStr();
|
|
|
+ if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ Date date = new Date();
|
|
|
+ SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
|
|
|
+ try {
|
|
|
+ date = dateFormat.parse(dateStr);
|
|
|
+ } catch (Exception e) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ String title = linkElement.text().replace(dateStr, "").trim();
|
|
|
+ if (sourceName.contains("市场") && (!title.contains("知识产权") || !title.contains("知识"))) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ // 检查文章是否已存在
|
|
|
+ ArticleInfo articleInfo = articleInfoService.getOne(new QueryWrapper<ArticleInfo>().lambda().eq(ArticleInfo::getTitle, title));
|
|
|
+ if (ObjectUtils.isNotEmpty(articleInfo)) {
|
|
|
+ continue; // 文章已存在,跳过
|
|
|
+ }
|
|
|
+
|
|
|
+ String link = linkElement.absUrl("href");
|
|
|
+ String digest = null;
|
|
|
+ try {
|
|
|
+ digest = this.getDigest(link);
|
|
|
+ } catch (Exception e) {
|
|
|
+ digest = "";
|
|
|
+ }
|
|
|
+ if (StringUtils.isEmpty(digest)) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ String condensedAbstract = null;
|
|
|
+ try {
|
|
|
+ condensedAbstract = difyService.getCondensedAbstract(digest);
|
|
|
+ } catch (Exception e) {
|
|
|
|
|
|
- // 查找所有匹配项并记录最后一个
|
|
|
- while (matcher.find()) {
|
|
|
- lastMatch = matcher.group(1);
|
|
|
+ }
|
|
|
+ if (StringUtils.isEmpty(condensedAbstract)) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
|
|
|
+ articleInfoDTO.setCategoryId(type);
|
|
|
+ articleInfoDTO.setSourceId(sourceId);
|
|
|
+ articleInfoDTO.setArticleUrl(link);
|
|
|
+ articleInfoDTO.setTitle(title);
|
|
|
+ articleInfoDTO.setPublicTime(date);
|
|
|
+ articleInfoDTO.setDigest(condensedAbstract);
|
|
|
+ articleInfoDTOS.add(articleInfoDTO);
|
|
|
+ count++;
|
|
|
}
|
|
|
+ articleInfoService.batchAddArticleInfo(articleInfoDTOS);
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 抓取新疆知识产权局资讯到咨询表中
|
|
|
+ *
|
|
|
+ * @param baseUrl
|
|
|
+ * @param sourceName
|
|
|
+ * @param sourceId
|
|
|
+ * @param type
|
|
|
+ * @throws Exception
|
|
|
+ */
|
|
|
+ public void crawlJiangxiArticles(String baseUrl, String sourceName, Integer sourceId, Integer type) throws Exception {
|
|
|
+ Map<String, Object> map = new HashMap<>();
|
|
|
+ String param = new Gson().toJson(map);
|
|
|
+ RequestBody requestBody = RequestBody.create(MediaType.parse("application/json"), param);
|
|
|
+ OkHttpClient okHttpClient = new OkHttpClient();
|
|
|
+ Request request = new Request.Builder()
|
|
|
+ .url("https://amr.jiangxi.gov.cn/queryList?current=1&unitid=368486&webSiteCode%5B%5D=amr&channelCode%5B%5D=tzgg&dataBefore=&dataAfter=&perPage=13&showMode=full&groupSize=1&barPosition=bottom&titleMax=34&templateContainerId=datalist&themeName=default&pageSize=13")
|
|
|
+ .post(requestBody)
|
|
|
+ .build();
|
|
|
+ String resBody = Objects.requireNonNull(okHttpClient.newCall(request).execute().body()).string();
|
|
|
+ JSONArray results = null;
|
|
|
+ try {
|
|
|
+ JSONObject jsonObject = JSONObject.parseObject(resBody);
|
|
|
+ JSONObject object = jsonObject.getJSONObject("data");
|
|
|
+ results = object.getJSONArray("results");
|
|
|
+ List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
|
|
|
+ int count = 0;
|
|
|
+ for (int i = 0; i < results.size(); i++) {
|
|
|
+ // 限制处理的新闻项数量,避免处理过多数据
|
|
|
+ if (count > 20) {
|
|
|
+ log.info("已达到处理上限(20条),停止处理更多新闻项");
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ JSONObject item = results.getJSONObject(i);
|
|
|
+ JSONObject source = item.getJSONObject("source");
|
|
|
|
|
|
- return lastMatch;
|
|
|
+ String pubDate = source.get("pubDate").toString();
|
|
|
+ String dateStr = RegexUtil.extractDate(pubDate);
|
|
|
+ String yesterdayDateStr = DateUtil.getYesterdayDateStr();
|
|
|
+ if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ Date date = new Date();
|
|
|
+ SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
|
|
|
+ try {
|
|
|
+ date = dateFormat.parse(dateStr);
|
|
|
+ } catch (Exception e) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ String title = source.get("title").toString();
|
|
|
+ if (sourceName.contains("市场") && (!title.contains("知识产权") || !title.contains("知识"))) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ // 检查文章是否已存在
|
|
|
+ ArticleInfo articleInfo = articleInfoService.getOne(new QueryWrapper<ArticleInfo>().lambda().eq(ArticleInfo::getTitle, title));
|
|
|
+ if (ObjectUtils.isNotEmpty(articleInfo)) {
|
|
|
+ continue; // 文章已存在,跳过
|
|
|
+ }
|
|
|
+ JSONObject urlObject = source.getJSONObject("urls");
|
|
|
+ String url = urlObject.get("pc").toString();
|
|
|
+ String link = baseUrl + url.substring(1);
|
|
|
+ String digest = null;
|
|
|
+ try {
|
|
|
+ digest = this.getDigest(link);
|
|
|
+ } catch (Exception e) {
|
|
|
+ digest = "";
|
|
|
+ }
|
|
|
+ if (StringUtils.isEmpty(digest)) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ String condensedAbstract = null;
|
|
|
+ try {
|
|
|
+ condensedAbstract = difyService.getCondensedAbstract(digest);
|
|
|
+ } catch (Exception ignored) {
|
|
|
+
|
|
|
+ }
|
|
|
+ if (StringUtils.isEmpty(condensedAbstract)) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
|
|
|
+ articleInfoDTO.setCategoryId(type);
|
|
|
+ articleInfoDTO.setSourceId(sourceId);
|
|
|
+ articleInfoDTO.setArticleUrl(link);
|
|
|
+ articleInfoDTO.setTitle(title);
|
|
|
+ articleInfoDTO.setPublicTime(date);
|
|
|
+ articleInfoDTO.setDigest(condensedAbstract);
|
|
|
+ articleInfoDTOS.add(articleInfoDTO);
|
|
|
+ count++;
|
|
|
+ }
|
|
|
+ articleInfoService.batchAddArticleInfo(articleInfoDTOS);
|
|
|
+ } catch (Exception ignored) {
|
|
|
+
|
|
|
+ }
|
|
|
}
|
|
|
+
|
|
|
}
|