Просмотр исходного кода

新增国内省份、智南针、中国知识产权网资讯

zero 2 недель назад
Родитель
Сommit
3b4e0fdcff

+ 17 - 0
src/main/java/com/cslg/ppa/common/utils/RegexUtil.java

@@ -76,5 +76,22 @@ public class RegexUtil {
             }
         }
         return result;
+
+    }
+    private static final Pattern DATE_PATTERN = Pattern.compile("(?<!\\d)(\\d{4}-\\d{2}-\\d{2})(?!\\d)"); // dddd-dd-dd that is not directly preceded or followed by another digit
+    public static String extractDate(String input) { // returns the LAST dddd-dd-dd substring found in input, or null if there is none
+        if (input == null || input.trim().isEmpty()) {
+            return null; // null or whitespace-only input
+        }
+        // Scan the whole input with the shared, precompiled pattern.
+        Matcher matcher = DATE_PATTERN.matcher(input);
+        String lastMatch = null;
+        // lastMatch stays null when nothing matches.
+        // Walk through every match and keep only the most recent one.
+        while (matcher.find()) {
+            lastMatch = matcher.group(1);
+        }
+        // The pattern only checks digit counts, so a value such as 2024-13-45 is returned unchanged.
+        return lastMatch;
     }
 }

+ 46 - 0
src/main/java/com/cslg/ppa/dto/HubeiArticleDTO.java

@@ -0,0 +1,46 @@
+package com.cslg.ppa.dto;
+
+import com.fasterxml.jackson.annotation.JsonFormat;
+import lombok.Data;
+import org.springframework.format.annotation.DateTimeFormat;
+
+import java.util.Date;
+
+@Data
+public class HubeiArticleDTO {
+    // Holder of String fields only; accessors are generated by Lombok's @Data. Field names are kept in upper case, presumably to match the keys of the Hubei office's JSON listing -- TODO confirm before renaming.
+    private String IdxID;
+
+    // URL
+    private String URL;
+
+    // Title
+    private String FILENAME;
+
+    // Type
+    private String FILETYPE;
+    // Presumably the document number -- TODO confirm
+    private String FILENUM;
+    // Presumably the issuing body -- TODO confirm
+    private String PUBLISHER;
+
+    // Time the document was issued
+    private String PUBDATE;
+
+    // Time the document was published (released)
+    private String DOCRELTIME;
+    // Presumably the validity status -- TODO confirm
+    private String EFFECTIVESTATE;
+    // Presumably the effective date (spelling kept as declared) -- TODO confirm
+    private String EFECTDATE;
+    // Presumably the abolition date -- TODO confirm
+    private String ABOLIDATE;
+    // Presumably the subject classification -- TODO confirm
+    private String SUBJECTCLASS;
+    // Presumably revision notes -- TODO confirm
+    private String REVISIONNOTES;
+    // Meaning not evident from the name -- TODO confirm
+    private String RECURL;
+    // Meaning not evident from the name -- TODO confirm
+    private String XGJD;
+}

+ 7 - 0
src/main/java/com/cslg/ppa/entity/SourceInfo.java

@@ -28,4 +28,11 @@ public class SourceInfo extends BaseEntity<SourceInfo> {
     //图标
     @TableField(value = "icon")
     private String icon;
+
+    // Website type
+    // 1 national
+    // 2 regional
+    // 3 other
+    @TableField(value = "web_type")
+    private Integer webType;
 }

+ 3 - 1
src/main/java/com/cslg/ppa/service/GetWebArticle/GetCNIPAArticleService.java

@@ -43,6 +43,9 @@ public class GetCNIPAArticleService {
     @Autowired
     private DifyService difyService;
 
+    /**
+     * 国家知识产权局
+     */
     @Scheduled(cron = "0 0 2 * * ?")
     @Transactional(propagation = Propagation.REQUIRED,rollbackFor = Throwable.class)
     public void getCNIPA(){
@@ -138,7 +141,6 @@ public class GetCNIPAArticleService {
                 articleInfoDTO.setTitle(title);
                 articleInfoDTO.setPublicTime(date);
                 articleInfoDTO.setDigest(condensedAbstract);
-//                articleInfoDTO.setPctDigest(pctCondensedAbstract);
                 articleInfoDTOS.add(articleInfoDTO);
             }
             articleInfoService.batchAddArticleInfo(articleInfoDTOS);

+ 153 - 0
src/main/java/com/cslg/ppa/service/GetWebArticle/GetCNIPRArticleService.java

@@ -0,0 +1,153 @@
+package com.cslg.ppa.service.GetWebArticle;
+
+import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
+import com.cslg.ppa.common.utils.DateUtil;
+import com.cslg.ppa.common.utils.RegexUtil;
+import com.cslg.ppa.dto.GetArticleInfoDTO;
+import com.cslg.ppa.entity.ArticleInfo;
+import com.cslg.ppa.service.ArticleInfoService;
+import com.cslg.ppa.service.commom.DifyService;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.lang3.ObjectUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Propagation;
+import org.springframework.transaction.annotation.Transactional;
+import org.springframework.util.CollectionUtils;
+
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+
+@Slf4j
+@Service
+@RequiredArgsConstructor
+public class GetCNIPRArticleService { // Crawls a CNIPR (China Intellectual Property Network) list page and stores the articles it accepts
+    private final ArticleInfoService articleInfoService;
+    @Autowired
+    private DifyService difyService;
+
+    /***
+     * Crawls the China Intellectual Property Network list page at {@code baseUrl}.
+     * Takes the first {@code div.bd} element, reads the first anchor of every {@code li}
+     * inside it, turns the anchor's href into an absolute URL and passes that URL to
+     * {@link #addCniprArticle(String)}. Every non-empty DTO that comes back is collected and
+     * the whole list is handed to {@code articleInfoService.batchAddArticleInfo} in one call.
+     *
+     * NOTE(review): {@code container} and the per-item anchor are dereferenced without a null
+     * check, and both dereferences sit outside the per-article try block, so a page without
+     * {@code div.bd} or a {@code li} without an anchor aborts the whole run -- confirm intended.
+     *
+     * @param baseUrl URL of the list page to fetch
+     * @throws IOException declared for the Jsoup fetch of the list page
+     */
+    //    @Scheduled(cron = "0 0 2 * * ?")
+    @Transactional(propagation = Propagation.REQUIRED,rollbackFor = Throwable.class)
+    public void crawCniprArticle(String baseUrl) throws IOException {
+        System.out.println(new Date() + "CNIPR-Begin");
+        // Fetch and parse the page with Jsoup
+        Document doc = Jsoup.connect(baseUrl)
+                .timeout(20000) // timeout value passed to Jsoup
+                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36")
+                .followRedirects(true) // follow redirects
+                .get();
+
+        // Locate the news list and iterate over its items
+        Element container = doc.selectFirst("div.bd");
+        Elements linkElements = container.select("li");
+        List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
+        for (Element linkElement : linkElements) {
+            Element element = linkElement.getElementsByTag("a").first();
+            String link = element.absUrl("href");
+            // If absUrl did not return a usable link, fall back to the raw href value
+            if (StringUtils.isEmpty(link)) {
+                link = element.attr("href");
+            }
+            if (link.startsWith("/")) {
+                // Take scheme and host from the base URL and prepend them
+                int endIndex = baseUrl.indexOf("/", 8); // start searching after http:// or https://
+                String baseDomain = endIndex > 0 ? baseUrl.substring(0, endIndex) : baseUrl;
+                link = baseDomain + link;
+            } else if (!link.startsWith("http")) {
+                // Relative path: resolve against the base URL up to and including its last slash
+                int lastSlash = baseUrl.lastIndexOf("/");
+                String basePath = lastSlash > 0 ? baseUrl.substring(0, lastSlash + 1) : baseUrl + "/";
+                link = basePath + link;
+            }
+
+            try {
+                GetArticleInfoDTO articleInfoDTO = this.addCniprArticle(link);
+                if (ObjectUtils.isNotEmpty(articleInfoDTO)) {
+                    articleInfoDTOS.add(articleInfoDTO);
+                }
+            } catch (Exception ignored) { // any failure for a single article is dropped here without logging; the loop continues
+            }
+        }
+        articleInfoService.batchAddArticleInfo(articleInfoDTOS);
+        System.out.println(new Date() + "CNIPR-Begin"); // NOTE(review): prints the same "CNIPR-Begin" marker as the start of the method -- probably meant to be an end marker
+    }
+    /**
+     * Fetches one article page and builds the DTO for it.
+     * Returns null when the date extracted from {@code div.time} differs from
+     * {@code DateUtil.getYesterdayDateStr()}, when that date cannot be parsed, or when
+     * {@code articleInfoService} already holds an article with the same title.
+     * The text sent to {@code difyService.getCondensedAbstract} is the non-empty {@code p}
+     * texts of {@code div.TRS_Editor} joined with newlines, or the element's whole text when
+     * there are none; an exception from that call leaves the stored digest as an empty string.
+     * Category id 6 and source id 38 are fixed literals.
+     *
+     * NOTE(review): the results of selectFirst / first() are dereferenced without null checks,
+     * so a page with a different layout raises a NullPointerException from this method.
+     *
+     * @param baseUrl URL of the article page; also stored as the article URL
+     * @return the populated DTO, or null when the article is skipped
+     * @throws IOException declared for the Jsoup fetch of the article page
+     */
+    public GetArticleInfoDTO addCniprArticle(String baseUrl) throws IOException {
+        // Fetch and parse the page with Jsoup
+        Document doc = Jsoup.connect(baseUrl)
+                .timeout(15000) // timeout value passed to Jsoup
+                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36")
+                .followRedirects(true) // follow redirects
+                .get();
+
+        Element initElement = doc.selectFirst("div.zx_xqcont_cent");
+        Element dateElement = initElement.selectFirst("div.type1_btm");
+        String pubTime = dateElement.select("div.time").first().text();
+        String dateStr = RegexUtil.extractDate(pubTime);
+        String yesterdayDateStr = DateUtil.getYesterdayDateStr();
+        if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
+            return null; // only articles dated "yesterday" are accepted
+        }
+        Date date = new Date(); // placeholder; replaced by the parsed value below, or the method returns
+        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
+        try {
+            date = dateFormat.parse(dateStr);
+        } catch (Exception e) {
+            return null;
+        }
+
+        Element titleElement = initElement.selectFirst("div.xq_cont_title");
+        String title = titleElement.select("p").first().text();
+        // Check whether an article with this title already exists
+        ArticleInfo articleInfo = articleInfoService.getOne(new QueryWrapper<ArticleInfo>().lambda().eq(ArticleInfo::getTitle, title));
+        if (ObjectUtils.isNotEmpty(articleInfo)) {
+            return null;
+        }
+
+        String digest = "";
+        List<String> list = new ArrayList<>();
+        Element element1 = doc.selectFirst("div.TRS_Editor");
+        Elements elements = element1.select("p");
+        for (Element element : elements) {
+            String text = element.text();
+            if (StringUtils.isNotEmpty(text)) {
+                list.add(text);
+            }
+        }
+        if (CollectionUtils.isEmpty(list)) { // no non-empty paragraph: fall back to the container's whole text
+            String articleContent = element1.text();
+            list.add(articleContent);
+            digest = StringUtils.join(list, "\n");
+        } else {
+            digest = StringUtils.join(list, "\n");
+        }
+        String condensedAbstract = "";
+        try {
+            condensedAbstract = difyService.getCondensedAbstract(digest);
+        } catch (Exception e) {
+            condensedAbstract = ""; // summarisation failure is tolerated; the article is still returned with an empty digest
+        }
+        GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
+        articleInfoDTO.setCategoryId(6);
+        articleInfoDTO.setSourceId(38);
+        articleInfoDTO.setArticleUrl(baseUrl);
+        articleInfoDTO.setTitle(title);
+        articleInfoDTO.setPublicTime(date);
+        articleInfoDTO.setDigest(condensedAbstract);
+        return articleInfoDTO;
+    }
+}

+ 515 - 60
src/main/java/com/cslg/ppa/service/GetWebArticle/GetProvinceNewsService.java

@@ -1,31 +1,48 @@
 package com.cslg.ppa.service.GetWebArticle;
 
+import com.alibaba.fastjson.JSONArray;
+import com.alibaba.fastjson.JSONObject;
+import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
 import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
 import com.cslg.ppa.common.utils.DateUtil;
+import com.cslg.ppa.common.utils.RegexUtil;
 import com.cslg.ppa.dto.GetArticleInfoDTO;
+import com.cslg.ppa.dto.HubeiArticleDTO;
 import com.cslg.ppa.entity.ArticleInfo;
+import com.cslg.ppa.entity.SourceInfo;
+import com.cslg.ppa.mapper.SourceInfoMapper;
+import com.cslg.ppa.service.ArticleInfoService;
 import com.cslg.ppa.service.commom.DifyService;
 import com.cslg.ppa.service.commom.XmlParseService;
+import com.google.gson.Gson;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
+import okhttp3.MediaType;
+import okhttp3.OkHttpClient;
+import okhttp3.Request;
+import okhttp3.RequestBody;
 import org.apache.commons.lang3.ObjectUtils;
 import org.apache.commons.lang3.StringUtils;
+import org.apache.http.HttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.http.util.EntityUtils;
+import org.apache.poi.hssf.record.DVALRecord;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
-import org.jsoup.nodes.Node;
 import org.jsoup.select.Elements;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Propagation;
+import org.springframework.transaction.annotation.Transactional;
 import org.springframework.util.CollectionUtils;
 
 import java.io.IOException;
-import java.text.ParseException;
+import java.nio.charset.StandardCharsets;
 import java.text.SimpleDateFormat;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Date;
-import java.util.List;
+import java.util.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
@@ -34,28 +51,94 @@ import java.util.stream.Collectors;
 @Service
 @RequiredArgsConstructor
 public class GetProvinceNewsService {
+    private final ArticleInfoService articleInfoService;
     private final XmlParseService xmlParseService;
     @Autowired
     private DifyService difyService;
+    @Autowired
+    private SourceInfoMapper sourceInfoMapper;
+
 
-    private static List<String> TitleList = Arrays.asList("要闻动态", "通知公告");
-    private static final Pattern DATE_PATTERN = Pattern.compile("(?<!\\d)(\\d{4}-\\d{2}-\\d{2})(?!\\d)");
+    private static List<Integer> SourceList = Arrays.asList(1);
+    private static List<String> TitleList = Arrays.asList("要闻动态", "通知公告", "公示公告", "公告");
     // 新闻动态项的CSS选择器 (通用配置,适用于大部分政府网站)
     private static final String NEWS_CONTAINER_SELECTOR = "div.con-right-list,div.main-content-right,div.tab-content,div.subpageCon-con,div.gl-main,div.nymain," +
-            "div.contain";
+            "div.contain,div.newsList,div.mainContent,div.gly,div.lists,div.mod_page_box,div.t_con,div.yearReport-con,div.innovate,ul.news-list," +
+            "div.top_right_con,div.listright,div.gl,div.list_container,div.zkmm_right,#subNavgsgg,div.content_right,div.r_box,div.m_t_12,div.zw-con,div.list-fr";
     private static final String TITLE_SELECTOR = "a";
     private static final String LINK_SELECTOR = "a[href]";
 
-    public void crawlArticles(String baseUrl) throws IOException {
+    /**
+     * Crawls news from the intellectual-property offices of China's provinces and regions.
+     * Loads every SourceInfo row with sourceType 1 and webType 2, waits one second before each
+     * source, and dispatches by exact source name: three named sources (Hebei, Jiangxi, Hubei)
+     * go to their dedicated crawl methods, everything else goes to the generic crawlArticles.
+     * The literal 2 is passed to each crawl method as its {@code type} argument.
+     * A failure while crawling one source is logged with log.warn and does not stop the loop.
+     *
+     * @throws Exception declared on the signature; per-source crawl failures are caught inside the loop
+     */
+//    @Scheduled(cron = "0 0 2 * * ?")
+    @Transactional(propagation = Propagation.REQUIRED,rollbackFor = Throwable.class)
+    public void addArticleFromWebSource() throws Exception {
+        System.out.println(new Date() + "Province-Begin");
+        List<SourceInfo> sourceInfos = sourceInfoMapper.selectList(new LambdaQueryWrapper<SourceInfo>()
+                .eq(SourceInfo::getSourceType, 1)
+                .eq(SourceInfo::getWebType, 2));
+        if (!CollectionUtils.isEmpty(sourceInfos)) {
+            for (SourceInfo sourceInfo : sourceInfos) {
+                try {
+                    Thread.sleep(1000); // one-second pause before each source
+                } catch (Exception ignored) {
+                    // NOTE(review): this also swallows InterruptedException without restoring the interrupt flag
+                }
+                Integer id = sourceInfo.getId();
+                String sourceName = sourceInfo.getSourceName();
+                String sourceUrl = sourceInfo.getSourceUrl();
+                if (StringUtils.equals("河北省市场监督管理局(知识产权局)", sourceName)) {
+                    try {
+                        this.crawlHebeiArticles(sourceUrl, sourceName, id, 2);
+                    } catch (Exception e) {
+                        log.warn("解析" + sourceName + "地区新闻项时出错: ", e);
+                    }
+                } else if (StringUtils.equals("江西省市场监督管理局(知识产权局)", sourceName)) {
+                    try {
+                        this.crawlJiangxiArticles(sourceUrl, sourceName, id, 2);
+                    } catch (Exception e) {
+                        log.warn("解析" + sourceName + "地区新闻项时出错: ", e);
+                    }
+                } else if (StringUtils.equals("湖北省知识产权局", sourceName)) {
+                    try {
+                        this.crawlHubeiArticles(sourceUrl, sourceName, id, 2);
+                    } catch (Exception e) {
+                        log.warn("解析" + sourceName + "地区新闻项时出错: ", e);
+                    }
+                } else { // every other source uses the generic crawler
+                    try {
+                        this.crawlArticles(sourceUrl, sourceName, id, 2);
+                    } catch (Exception e) {
+                        log.warn("解析" + sourceName + "地区新闻项时出错: ", e);
+                    }
+                }
+            }
+        }
+        System.out.println(new Date() + "Province-Begin"); // NOTE(review): prints the same "Province-Begin" marker as the start of the method -- probably meant to be an end marker
+    }
+
+    /**
+     * 通用性抓取各大省份知识产权局资讯
+     *
+     * @param baseUrl
+     * @param sourceName
+     * @param sourceId
+     * @param type
+     * @throws Exception
+     */
+    public void crawlArticles(String baseUrl, String sourceName, Integer sourceId, Integer type) throws Exception {
         // 使用Jsoup连接并解析网页
         Document doc = Jsoup.connect(baseUrl)
-                .timeout(15000) // 增加超时时间
+                .timeout(20000) // 增加超时时间
                 .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36")
                 .followRedirects(true) // 跟随重定向
                 .get();
 
         // 改进的新闻列表抓取逻辑
-        Elements newsItems  = doc.select(LINK_SELECTOR);
+        Elements newsItems = doc.select(LINK_SELECTOR);
         List<String> list = new ArrayList<>();
         for (Element item : newsItems) {
             try {
@@ -64,9 +147,14 @@ public class GetProvinceNewsService {
 
                 if (titleElement != null && linkElement != null) {
                     String title = titleElement.text().trim();
-                    if (TitleList.contains(title)) {
+                    if (baseUrl.contains("amr.hainan.gov.cn") && StringUtils.equals("通知公告", title)) {
                         String link = linkElement.absUrl("href"); // 获取绝对URL
                         list.add(link);
+                    } else {
+                        if (TitleList.contains(title) && !baseUrl.contains("amr.hainan.gov.cn")) {
+                            String link = linkElement.absUrl("href"); // 获取绝对URL
+                            list.add(link);
+                        }
                     }
                 }
             } catch (Exception e) {
@@ -76,13 +164,26 @@ public class GetProvinceNewsService {
         if (!CollectionUtils.isEmpty(list)) {
             List<String> collect = list.stream().distinct().collect(Collectors.toList());
             for (String url : collect) {
-                this.crawlArticlesDetail(url);
+                this.crawlArticlesDetail(url, sourceName, sourceId, type);
             }
         }
     }
 
-    public List<GetArticleInfoDTO> crawlArticlesDetail(String baseUrl) throws IOException {
-        List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
+    /**
+     * 抓取具体某个知识产权局过程详情
+     *
+     * @param baseUrl
+     * @param sourceName
+     * @param sourceId
+     * @param type
+     * @throws Exception
+     */
+    public void crawlArticlesDetail(String baseUrl, String sourceName, Integer sourceId, Integer type) throws Exception {
+        try {
+            Thread.sleep(1000);
+        } catch (Exception ignored) {
+
+        }
         // 使用Jsoup连接并解析网页
         Document doc = Jsoup.connect(baseUrl)
                 .timeout(15000) // 增加超时时间
@@ -92,11 +193,13 @@ public class GetProvinceNewsService {
 
         // 改进的新闻列表抓取逻辑
         Element container = doc.selectFirst(NEWS_CONTAINER_SELECTOR);
+        if (container == null && baseUrl.contains("www.ahippc.cn")) {
+            container = doc.selectFirst("div.right");
+        }
         Element scriptElement = container.select("script").first();
         List<String> reStrs = new ArrayList<>();
         if (scriptElement == null) {
             Elements elements = container.select("li");
-//            List<String> collect = elements.stream().map(Node::outerHtml).collect(Collectors.toList());
             for (Element element : elements) {
                 String liTag = element.outerHtml().trim();
                 reStrs.add(liTag);
@@ -112,31 +215,57 @@ public class GetProvinceNewsService {
             }
         }
         if (CollectionUtils.isEmpty(reStrs)) {
-            Elements elements = container.select("li");
-//            List<String> collect = elements.stream().map(Node::outerHtml).collect(Collectors.toList());
+            Elements elements = null;
+            if (baseUrl.contains("www.ahippc.cn")) {
+                elements = container.select("dd.fix");
+            } else {
+                elements = container.select("li");
+            }
             for (Element element : elements) {
                 String liTag = element.outerHtml().trim();
                 reStrs.add(liTag);
             }
         }
-        int count = 1;
+
+        commonProvinceAddArticle(reStrs, baseUrl, sourceName, sourceId, type);
+    }
+
+    /**
+     * 将具体知识产权局相关资讯信息添加到资讯表中
+     *
+     * @param reStrs
+     * @param baseUrl
+     * @param sourceName
+     * @param sourceId
+     * @param type
+     */
+    public void commonProvinceAddArticle(List<String> reStrs, String baseUrl, String sourceName, Integer sourceId, Integer type) {
+        List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
+        int count = 0;
         for (String reStr : reStrs) {
             // 限制处理的新闻项数量,避免处理过多数据
-            if (count >= 10) {
-                log.info("已达到处理上限(30条),停止处理更多新闻项");
+            if (count > 20) {
+                log.info("已达到处理上限(20条),停止处理更多新闻项");
                 break;
             }
             Document document = Jsoup.parse(reStr);
             Element linkElement = document.getElementsByTag("a").first();
+            if (baseUrl.contains("scjg.jl.gov.cn")) {
+                Elements linkElements = document.getElementsByTag("a");
+                linkElement = linkElements.get(1);
+            }
             //获取时间
             String dateStr = document.getElementsByTag("span").text().trim();
-            if (baseUrl.contains("zjippc.org.cn")) {
-                dateStr = extractDate(dateStr);
+            if (baseUrl.contains("zjippc.org.cn") || baseUrl.contains("www.sxippc.com")) {
+                dateStr = RegexUtil.extractDate(dateStr);
+            } else if (baseUrl.contains("amr.hainan.gov.cn") || baseUrl.contains("scjg.jl.gov.cn")) {
+                dateStr = document.getElementsByTag("em").text().trim();
+                dateStr = getHainanDate(dateStr);
             }
             String yesterdayDateStr = DateUtil.getYesterdayDateStr();
-//            if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
-//                continue;
-//            }
+            if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
+                break;
+            }
             Date date = new Date();
             SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
             try {
@@ -146,14 +275,21 @@ public class GetProvinceNewsService {
             }
 
             String title = linkElement.text();
-            if (baseUrl.contains("zjippc.org.cn")) {
-                title = title.replace(dateStr, "");
+            if (baseUrl.contains("zjippc.org.cn") || baseUrl.contains("www.sxippc.com") || baseUrl.contains("hlipa.hlj.gov.cn") || baseUrl.contains("scjg.nx.gov.cn")) {
+                title = title.replace(dateStr, "").trim();
+            } else if (baseUrl.contains("amr.yn.gov.cn") || baseUrl.contains("amr.nmg.gov.cn") || baseUrl.contains("scjgj.qinghai.gov.cn")
+                    || baseUrl.contains("scjgj.sc.gov.cn") || baseUrl.contains("scjg.jl.gov.cn")
+                    || baseUrl.contains("amr.guizhou.gov.cn") || baseUrl.contains("zscq.tj.gov.cn")) {
+                title = linkElement.attr("title").trim();
+            }
+            if (sourceName.contains("市场") && (!title.contains("知识产权") || !title.contains("知识"))) {
+                continue;
             }
             // 检查文章是否已存在
-//            ArticleInfo articleInfo = articleInfoService.getOne(new QueryWrapper<ArticleInfo>().lambda().eq(ArticleInfo::getTitle, title));
-//            if (ObjectUtils.isNotEmpty(articleInfo)) {
-//                continue; // 文章已存在,跳过
-//            }
+            ArticleInfo articleInfo = articleInfoService.getOne(new QueryWrapper<ArticleInfo>().lambda().eq(ArticleInfo::getTitle, title));
+            if (ObjectUtils.isNotEmpty(articleInfo)) {
+                continue; // 文章已存在,跳过
+            }
 
             String link = linkElement.absUrl("href");
             // 如果absUrl没有返回有效链接,尝试其他方式
@@ -176,38 +312,47 @@ public class GetProvinceNewsService {
             try {
                 digest = this.getDigest(link);
             } catch (Exception e) {
-                System.out.println(e);
-                System.out.println(link);
-                System.out.println(title);
+                digest = "";
+
+            }
+            if (StringUtils.isEmpty(digest)) {
                 continue;
             }
-//            if (StringUtils.isEmpty(digest)) {
-//                continue;
-//            }
             String condensedAbstract = null;
-//            try {
-//                condensedAbstract = difyService.getCondensedAbstract(digest);
-//            } catch (Exception e) {
-//
-//            }
-//            if (StringUtils.isEmpty(condensedAbstract)) {
-//                continue;
-//            }
+            try {
+                condensedAbstract = difyService.getCondensedAbstract(digest);
+            } catch (Exception ignored) {
+
+            }
+            if (StringUtils.isEmpty(condensedAbstract)) {
+                continue;
+            }
             GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
-//            articleInfoDTO.setCategoryId(type);
-            articleInfoDTO.setSourceId(1);
+            articleInfoDTO.setCategoryId(type);
+            articleInfoDTO.setSourceId(sourceId);
             articleInfoDTO.setArticleUrl(link);
             articleInfoDTO.setTitle(title);
             articleInfoDTO.setPublicTime(date);
-            articleInfoDTO.setDigest(digest);
-//            articleInfoDTOS.add(articleInfoDTO);
+            articleInfoDTO.setDigest(condensedAbstract);
+            articleInfoDTOS.add(articleInfoDTO);
             count++;
         }
-//        articleInfoService.batchAddArticleInfo(articleInfoDTOS);
-        return articleInfoDTOS;
+        articleInfoService.batchAddArticleInfo(articleInfoDTOS);
     }
 
+    /**
+     * 获取资讯通知内容
+     *
+     * @param baseUrl
+     * @return
+     * @throws IOException
+     */
     public String getDigest(String baseUrl) throws IOException {
+        try {
+            Thread.sleep(1000);
+        } catch (Exception ignored) {
+
+        }
         // 使用Jsoup连接并解析网页
         Document doc = Jsoup.connect(baseUrl)
                 .timeout(15000) // 增加超时时间
@@ -235,19 +380,329 @@ public class GetProvinceNewsService {
         return content;
     }
 
-    public static String extractDate(String input) {
+    /** Matches a {@code YYYY-MM-DD} date; compiled once instead of on every call. */
+    private static final Pattern FIRST_DATE_PATTERN = Pattern.compile("\\d{4}-\\d{2}-\\d{2}");
+
+    /**
+     * Extracts the first {@code YYYY-MM-DD} date found in the given text.
+     * NOTE(review): the previous comment referred to the Zhejiang IP protection centre while the
+     * method name says Hainan — confirm which site this helper is meant for.
+     *
+     * @param input text that may contain a date; may be {@code null}
+     * @return the first matching date string, or {@code null} when the input is null/blank or
+     *         contains no such date
+     */
+    public static String getHainanDate(String input) {
         if (input == null || input.trim().isEmpty()) {
             return null;
         }
+        Matcher matcher = FIRST_DATE_PATTERN.matcher(input);
+        // only the first occurrence is of interest
+        return matcher.find() ? matcher.group() : null;
+    }
+
+    private void setupHeaders(HttpGet request) {
+        request.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0");
+        request.addHeader("Cookie", "lcid=1043; __jsluid_s=d8da8b71aed1f47e6d74773d87ebf074; _va_ref=%5B%22%22%2C%22%22%2C1724059829%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DuiYh-tKeu5W26B6buTNuL3NeDZ5sZfdDhBXd4R344papXwOqiQ_DhapSKUUahDaN%26wd%3D%26eqid%3Dc0d9d9b6002526d400000004668cdae9%22%5D; _va_id=f7c6b7152dd01f89.1720512427.4.1724059829.1724059829.; _va_ses=*");
+    }
+
+    /**
+     * Crawls the Hubei Intellectual Property Office JSON feed and stores yesterday's articles.
+     * <p>
+     * Entries are read in feed order and processing stops at the first entry whose release date
+     * is not yesterday (this relies on the feed being sorted newest-first — TODO confirm).
+     * Entries whose title already exists, or whose detail text or condensed abstract cannot be
+     * obtained, are skipped. At most 20 articles are collected and saved in one batch.
+     *
+     * @param baseUrl    site root URL; the feed path is appended directly to it
+     * @param sourceName name of the source; when it contains "市场" only titles containing
+     *                   "知识产权" are kept
+     * @param sourceId   source id stored on every saved article
+     * @param type       category id stored on every saved article
+     * @throws Exception kept for interface compatibility; crawl failures are logged, not rethrown
+     */
+    public void crawlHubeiArticles(String baseUrl, String sourceName, Integer sourceId, Integer type) throws Exception {
+        String url = baseUrl + "fbjd/zc/qtzdgkwj/qtgkwj.json";
+        // try-with-resources releases the HTTP client even when the crawl fails part-way
+        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
+            HttpGet request = new HttpGet(url);
+            setupHeaders(request);
+            HttpResponse response = httpClient.execute(request);
+            String responseBody = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
+            JSONObject object = JSONObject.parseObject(responseBody);
+            List<HubeiArticleDTO> list = object == null
+                    ? null
+                    : JSONObject.parseArray(object.getString("data"), HubeiArticleDTO.class);
+            if (list == null) {
+                log.warn("Hubei feed {} returned no article data", url);
+                return;
+            }
+
+            String yesterdayDateStr = DateUtil.getYesterdayDateStr();
+            SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
+            List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
+            for (HubeiArticleDTO articleDTO : list) {
+                // cap the batch at 20 articles (the former "> 20" check let a 21st one through)
+                if (articleInfoDTOS.size() >= 20) {
+                    log.info("已达到处理上限(20条),停止处理更多新闻项");
+                    break;
+                }
+                String dateStr = RegexUtil.extractDate(articleDTO.getDOCRELTIME());
+                if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
+                    break;
+                }
+                Date date;
+                try {
+                    date = dateFormat.parse(dateStr);
+                } catch (Exception e) {
+                    continue;
+                }
+                String title = articleDTO.getFILENAME();
+                // a title containing "知识产权" necessarily contains "知识", so one check is enough;
+                // the null test keeps one malformed entry from aborting the whole batch
+                if (sourceName.contains("市场") && (title == null || !title.contains("知识产权"))) {
+                    continue;
+                }
+                // skip articles that were stored before (matched by title)
+                ArticleInfo articleInfo = articleInfoService.getOne(new QueryWrapper<ArticleInfo>().lambda().eq(ArticleInfo::getTitle, title));
+                if (ObjectUtils.isNotEmpty(articleInfo)) {
+                    continue;
+                }
+                String link = articleDTO.getURL();
+                String digest;
+                try {
+                    digest = this.getDigest(link);
+                } catch (Exception e) {
+                    log.warn("Failed to fetch article content from {}", link, e);
+                    digest = "";
+                }
+                if (StringUtils.isEmpty(digest)) {
+                    continue;
+                }
+                String condensedAbstract = null;
+                try {
+                    condensedAbstract = difyService.getCondensedAbstract(digest);
+                } catch (Exception e) {
+                    log.warn("Failed to condense abstract for {}", link, e);
+                }
+                if (StringUtils.isEmpty(condensedAbstract)) {
+                    continue;
+                }
+                GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
+                articleInfoDTO.setCategoryId(type);
+                articleInfoDTO.setSourceId(sourceId);
+                articleInfoDTO.setArticleUrl(link);
+                articleInfoDTO.setTitle(title);
+                articleInfoDTO.setPublicTime(date);
+                articleInfoDTO.setDigest(condensedAbstract);
+                articleInfoDTOS.add(articleInfoDTO);
+            }
+            articleInfoService.batchAddArticleInfo(articleInfoDTOS);
+        } catch (Exception e) {
+            // best-effort crawl: record the failure instead of hiding it, but do not propagate
+            log.error("Failed to crawl Hubei articles from {}", url, e);
+        }
+    }
+
+    /**
+     * Crawls the Hebei site: loads the page at {@code baseUrl}, collects every link matched by
+     * {@code LINK_SELECTOR} whose href is exactly "/node/919", and passes each resolved URL to
+     * {@link #addHebeiArticle(String, String, Integer, Integer)}.
+     *
+     * @param baseUrl    URL of the page to scan for list-page links
+     * @param sourceName forwarded unchanged to addHebeiArticle
+     * @param sourceId   forwarded unchanged to addHebeiArticle
+     * @param type       forwarded unchanged to addHebeiArticle
+     * @throws Exception if the page cannot be loaded or addHebeiArticle fails
+     */
+    public void crawlHebeiArticles(String baseUrl, String sourceName, Integer sourceId, Integer type) throws Exception {
+        Document homePage = Jsoup.connect(baseUrl)
+                .timeout(20000) // generous timeout
+                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36")
+                .followRedirects(true)
+                .get();
+
+        // first gather the list-page URLs, then process them
+        List<String> listPageUrls = new ArrayList<>();
+        for (Element anchor : homePage.select(LINK_SELECTOR)) {
+            try {
+                if (!StringUtils.equals(anchor.attr("href"), "/node/919")) {
+                    continue;
+                }
+                listPageUrls.add(anchor.absUrl("href"));
+            } catch (Exception e) {
+                log.warn("解析单个新闻项时出错: ", e);
+            }
+        }
+        for (String listPageUrl : listPageUrls) {
+            addHebeiArticle(listPageUrl, sourceName, sourceId, type);
+        }
+    }
+
+    /**
+     * Reads a Hebei news list page and stores the articles dated yesterday.
+     * <p>
+     * Links are processed in page order and processing stops at the first link whose date is not
+     * yesterday (this relies on the list being sorted newest-first — TODO confirm). Links without
+     * a date element, with an already stored title, or whose content or condensed abstract cannot
+     * be obtained are skipped. At most 20 articles are collected and saved in one batch.
+     *
+     * @param baseUrl    URL of the news list page
+     * @param sourceName name of the source; when it contains "市场" only titles containing
+     *                   "知识产权" are kept
+     * @param sourceId   source id stored on every saved article
+     * @param type       category id stored on every saved article
+     * @throws Exception if the list page cannot be loaded or the persistence layer fails
+     */
+    public void addHebeiArticle(String baseUrl, String sourceName, Integer sourceId, Integer type) throws Exception {
+        Document doc = Jsoup.connect(baseUrl)
+                .timeout(15000) // generous timeout
+                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36")
+                .followRedirects(true)
+                .get();
 
-        Matcher matcher = DATE_PATTERN.matcher(input);
-        String lastMatch = null;
+        Element container = doc.selectFirst(NEWS_CONTAINER_SELECTOR);
+        if (container == null) {
+            // page layout changed or the page is empty: nothing to crawl (previously an NPE)
+            log.warn("No news container found on {}", baseUrl);
+            return;
+        }
+        Elements linkElements = container.select("a.zkmmr_tl1_item_a");
+        String yesterdayDateStr = DateUtil.getYesterdayDateStr();
+        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
+        List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
+        for (Element linkElement : linkElements) {
+            // cap the batch at 20 articles (the former "> 20" check let a 21st one through)
+            if (articleInfoDTOS.size() >= 20) {
+                log.info("已达到处理上限(20条),停止处理更多新闻项");
+                break;
+            }
+            Element dateElement = linkElement.select("p.zkmmr_tl1_item_date").first();
+            if (dateElement == null) {
+                // a link without a date cannot be classified; skip it instead of failing the batch
+                continue;
+            }
+            String dateStr = dateElement.text().trim();
+            if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
+                break;
+            }
+            Date date;
+            try {
+                date = dateFormat.parse(dateStr);
+            } catch (Exception e) {
+                continue;
+            }
+
+            // the link text contains the date as well; strip it to get the bare title
+            String title = linkElement.text().replace(dateStr, "").trim();
+            // a title containing "知识产权" necessarily contains "知识", so one check is enough
+            if (sourceName.contains("市场") && !title.contains("知识产权")) {
+                continue;
+            }
+            // skip articles that were stored before (matched by title)
+            ArticleInfo articleInfo = articleInfoService.getOne(new QueryWrapper<ArticleInfo>().lambda().eq(ArticleInfo::getTitle, title));
+            if (ObjectUtils.isNotEmpty(articleInfo)) {
+                continue;
+            }
+
+            String link = linkElement.absUrl("href");
+            String digest;
+            try {
+                digest = this.getDigest(link);
+            } catch (Exception e) {
+                log.warn("Failed to fetch article content from {}", link, e);
+                digest = "";
+            }
+            if (StringUtils.isEmpty(digest)) {
+                continue;
+            }
+            String condensedAbstract = null;
+            try {
+                condensedAbstract = difyService.getCondensedAbstract(digest);
+            } catch (Exception e) {
+                log.warn("Failed to condense abstract for {}", link, e);
 
-        // 查找所有匹配项并记录最后一个
-        while (matcher.find()) {
-            lastMatch = matcher.group(1);
+            }
+            if (StringUtils.isEmpty(condensedAbstract)) {
+                continue;
+            }
+            GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
+            articleInfoDTO.setCategoryId(type);
+            articleInfoDTO.setSourceId(sourceId);
+            articleInfoDTO.setArticleUrl(link);
+            articleInfoDTO.setTitle(title);
+            articleInfoDTO.setPublicTime(date);
+            articleInfoDTO.setDigest(condensedAbstract);
+            articleInfoDTOS.add(articleInfoDTO);
         }
+        articleInfoService.batchAddArticleInfo(articleInfoDTOS);
+    }
+
+    /**
+     * Crawls the Jiangxi market-regulation site's notice list and stores yesterday's articles.
+     * <p>
+     * The list is fetched with a POST to the site's fixed {@code queryList} endpoint (the request
+     * URL is hard-coded; {@code baseUrl} is only used to build article links). Results are read
+     * in order and processing stops at the first entry whose publication date is not yesterday
+     * (this relies on the list being sorted newest-first — TODO confirm). Entries with an already
+     * stored title, or whose content or condensed abstract cannot be obtained, are skipped. At
+     * most 20 articles are collected and saved in one batch.
+     *
+     * @param baseUrl    site root URL that the relative article path is appended to
+     * @param sourceName name of the source; when it contains "市场" only titles containing
+     *                   "知识产权" are kept
+     * @param sourceId   source id stored on every saved article
+     * @param type       category id stored on every saved article
+     * @throws Exception if the list request itself fails; failures while processing the
+     *                   response are logged, not rethrown
+     */
+    public void crawlJiangxiArticles(String baseUrl, String sourceName, Integer sourceId, Integer type) throws Exception {
+        // the endpoint is called with an empty JSON object as the POST body
+        Map<String, Object> map = new HashMap<>();
+        String param = new Gson().toJson(map);
+        RequestBody requestBody = RequestBody.create(MediaType.parse("application/json"), param);
+        OkHttpClient okHttpClient = new OkHttpClient();
+        Request request = new Request.Builder()
+                .url("https://amr.jiangxi.gov.cn/queryList?current=1&unitid=368486&webSiteCode%5B%5D=amr&channelCode%5B%5D=tzgg&dataBefore=&dataAfter=&perPage=13&showMode=full&groupSize=1&barPosition=bottom&titleMax=34&templateContainerId=datalist&themeName=default&pageSize=13")
+                .post(requestBody)
+                .build();
+        String resBody = Objects.requireNonNull(okHttpClient.newCall(request).execute().body()).string();
+        try {
+            JSONObject jsonObject = JSONObject.parseObject(resBody);
+            JSONObject data = jsonObject == null ? null : jsonObject.getJSONObject("data");
+            JSONArray results = data == null ? null : data.getJSONArray("results");
+            if (results == null) {
+                log.warn("Jiangxi queryList response contained no results");
+                return;
+            }
+            String yesterdayDateStr = DateUtil.getYesterdayDateStr();
+            SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
+            List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
+            for (int i = 0; i < results.size(); i++) {
+                // cap the batch at 20 articles (the former "> 20" check let a 21st one through)
+                if (articleInfoDTOS.size() >= 20) {
+                    log.info("已达到处理上限(20条),停止处理更多新闻项");
+                    break;
+                }
+                JSONObject item = results.getJSONObject(i);
+                JSONObject source = item == null ? null : item.getJSONObject("source");
 
-        return lastMatch;
+                if (source == null) {
+                    // malformed entry: skip it instead of dropping the whole batch
+                    continue;
+                }
+                // getString is null-safe, unlike get(...).toString()
+                String dateStr = RegexUtil.extractDate(source.getString("pubDate"));
+                if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
+                    break;
+                }
+                Date date;
+                try {
+                    date = dateFormat.parse(dateStr);
+                } catch (Exception e) {
+                    continue;
+                }
+                String title = source.getString("title");
+                // a title containing "知识产权" necessarily contains "知识", so one check is enough
+                if (sourceName.contains("市场") && (title == null || !title.contains("知识产权"))) {
+                    continue;
+                }
+                // skip articles that were stored before (matched by title)
+                ArticleInfo articleInfo = articleInfoService.getOne(new QueryWrapper<ArticleInfo>().lambda().eq(ArticleInfo::getTitle, title));
+                if (ObjectUtils.isNotEmpty(articleInfo)) {
+                    continue;
+                }
+                JSONObject urlObject = source.getJSONObject("urls");
+                String url = urlObject == null ? null : urlObject.getString("pc");
+                if (StringUtils.isEmpty(url)) {
+                    continue;
+                }
+                // drop the first character of the path before appending it to baseUrl
+                String link = baseUrl + url.substring(1);
+                String digest;
+                try {
+                    digest = this.getDigest(link);
+                } catch (Exception e) {
+                    log.warn("Failed to fetch article content from {}", link, e);
+                    digest = "";
+                }
+                if (StringUtils.isEmpty(digest)) {
+                    continue;
+                }
+                String condensedAbstract = null;
+                try {
+                    condensedAbstract = difyService.getCondensedAbstract(digest);
+                } catch (Exception e) {
+                    log.warn("Failed to condense abstract for {}", link, e);
+                }
+                if (StringUtils.isEmpty(condensedAbstract)) {
+                    continue;
+                }
+                GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
+                articleInfoDTO.setCategoryId(type);
+                articleInfoDTO.setSourceId(sourceId);
+                articleInfoDTO.setArticleUrl(link);
+                articleInfoDTO.setTitle(title);
+                articleInfoDTO.setPublicTime(date);
+                articleInfoDTO.setDigest(condensedAbstract);
+                articleInfoDTOS.add(articleInfoDTO);
+            }
+            articleInfoService.batchAddArticleInfo(articleInfoDTOS);
+        } catch (Exception e) {
+            // best-effort crawl: record the failure instead of hiding it, but do not propagate
+            log.error("Failed to process Jiangxi article list", e);
+        }
     }
+
 }

+ 216 - 0
src/main/java/com/cslg/ppa/service/GetWebArticle/GetZhiNanZhenArticleService.java

@@ -0,0 +1,216 @@
+package com.cslg.ppa.service.GetWebArticle;
+
+import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
+import com.cslg.ppa.common.utils.DateUtil;
+import com.cslg.ppa.common.utils.RegexUtil;
+import com.cslg.ppa.dto.GetArticleInfoDTO;
+import com.cslg.ppa.entity.ArticleInfo;
+import com.cslg.ppa.service.ArticleInfoService;
+import com.cslg.ppa.service.commom.DifyService;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.lang3.ObjectUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Propagation;
+import org.springframework.transaction.annotation.Transactional;
+import org.springframework.util.CollectionUtils;
+
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * Crawls the ZhiNanZhen site: collects article links from selected sections of the landing
+ * page, loads each article, and stores those published yesterday.
+ */
+@Slf4j
+@Service
+@RequiredArgsConstructor
+public class GetZhiNanZhenArticleService {
+    /** Category id stored on every article produced by this service. */
+    private static final int CATEGORY_ID = 6;
+    /** Source id stored on every article produced by this service. */
+    private static final int SOURCE_ID = 39;
+    /**
+     * Number of trailing characters of the article heading that are cut off to obtain the title.
+     * NOTE(review): presumably the length of the publish-time text appended to the heading — confirm.
+     */
+    private static final int TITLE_SUFFIX_LENGTH = 23;
+    /** Landing-page sections whose links are crawled. */
+    private static final List<String> SECTION_TITLES =
+            Arrays.asList("海外知识产权动态信息", "海外知识产权实务指引", "海外知识产权官费专栏");
+
+    private final ArticleInfoService articleInfoService;
+    @Autowired
+    private DifyService difyService;
+
+    /**
+     * Crawls the ZhiNanZhen landing page and stores yesterday's articles of every configured section.
+     *
+     * @param baseUrl URL of the landing page
+     * @throws Exception if the landing page cannot be loaded or saving a batch fails
+     */
+    //    @Scheduled(cron = "0 0 2 * * ?")
+    @Transactional(propagation = Propagation.REQUIRED,rollbackFor = Throwable.class)
+    public void crawZhiNanZhenArticle(String baseUrl) throws Exception {
+        log.info("ZhiNanZhen-Begin");
+        Document doc = Jsoup.connect(baseUrl)
+                .timeout(20000) // generous timeout
+                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36")
+                .followRedirects(true)
+                .get();
+        for (String sectionTitle : SECTION_TITLES) {
+            extractSectionLinks(doc, sectionTitle);
+        }
+        log.info("ZhiNanZhen-End");
+    }
+
+    /**
+     * Collects the article links of the section whose header contains {@code sectionTitle},
+     * loads each article and saves the resulting batch.
+     *
+     * @param doc          parsed landing page
+     * @param sectionTitle text the section header must contain
+     * @throws Exception if saving the batch fails; failures on individual articles are logged
+     */
+    private void extractSectionLinks(Document doc, String sectionTitle) throws Exception {
+        List<String> links = new ArrayList<>();
+        for (Element section : doc.select("section.section")) {
+            Element header = section.selectFirst(".section-header h2");
+            if (header == null || !header.text().contains(sectionTitle)) {
+                continue;
+            }
+            for (Element item : section.select("ul li a[href]")) {
+                String href = item.attr("href");
+                // keep only the links the filter accepts; the previous condition was negated
+                // and therefore kept exactly the links the filter rejects
+                if (!isValidNewsLink(item, href, item.text())) {
+                    continue;
+                }
+                // resolve relative hrefs against the page URL so Jsoup.connect can load them
+                String absoluteUrl = item.absUrl("href");
+                if (!absoluteUrl.isEmpty()) {
+                    links.add(absoluteUrl);
+                }
+            }
+        }
+
+        List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
+        for (String link : links) {
+            try {
+                GetArticleInfoDTO articleInfoDTO = this.addZhiNanZhenArticle(link);
+                if (ObjectUtils.isNotEmpty(articleInfoDTO)) {
+                    articleInfoDTOS.add(articleInfoDTO);
+                }
+            } catch (Exception e) {
+                // one broken article must not stop the rest of the section
+                log.warn("Failed to process ZhiNanZhen article {}", link, e);
+            }
+        }
+        articleInfoService.batchAddArticleInfo(articleInfoDTOS);
+    }
+
+    /**
+     * Loads one article page and converts it into a DTO ready to be stored.
+     *
+     * @param baseUrl URL of the article page
+     * @return the article DTO, or {@code null} when the page lacks the expected structure, the
+     *         article was not published yesterday, or an article with the same title already exists
+     * @throws IOException if the page cannot be loaded
+     */
+    public GetArticleInfoDTO addZhiNanZhenArticle(String baseUrl) throws IOException {
+        Document doc = Jsoup.connect(baseUrl)
+                .timeout(15000) // generous timeout
+                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36")
+                .followRedirects(true)
+                .get();
+
+        Element initElement = doc.selectFirst("div.con_right");
+        Element titleElement = initElement == null ? null : initElement.selectFirst("h1");
+        if (titleElement == null) {
+            // not an article page (previously a NullPointerException)
+            return null;
+        }
+        // the publish time is rendered in span elements inside the heading
+        String pubTime = titleElement.getElementsByTag("span").text().trim();
+        String dateStr = RegexUtil.extractDate(pubTime);
+        if (!StringUtils.equals(dateStr, DateUtil.getYesterdayDateStr())) {
+            return null;
+        }
+        Date date;
+        try {
+            date = new SimpleDateFormat("yyyy-MM-dd").parse(dateStr);
+        } catch (Exception e) {
+            return null;
+        }
+
+        String titleContent = titleElement.text();
+        if (titleContent.length() <= TITLE_SUFFIX_LENGTH) {
+            // heading too short to contain a title in front of the suffix
+            // (previously a StringIndexOutOfBoundsException or an empty title)
+            return null;
+        }
+        String title = titleContent.substring(0, titleContent.length() - TITLE_SUFFIX_LENGTH).trim();
+        // skip articles that were stored before (matched by title)
+        ArticleInfo articleInfo = articleInfoService.getOne(new QueryWrapper<ArticleInfo>().lambda().eq(ArticleInfo::getTitle, title));
+        if (ObjectUtils.isNotEmpty(articleInfo)) {
+            return null;
+        }
+
+        // body text: the distinct non-empty span texts of div.dl, one per line
+        List<String> paragraphs = new ArrayList<>();
+        Element textElement = initElement.selectFirst("div.dl");
+        if (textElement != null) {
+            for (Element element : textElement.select("span")) {
+                String text = element.text().trim();
+                if (StringUtils.isNotEmpty(text)) {
+                    paragraphs.add(text);
+                }
+            }
+        }
+        List<String> distinctParagraphs = paragraphs.stream().distinct().collect(Collectors.toList());
+        // fall back to the whole container text when no span text was found
+        String digest = CollectionUtils.isEmpty(distinctParagraphs)
+                ? initElement.text()
+                : StringUtils.join(distinctParagraphs, "\n");
+
+        String condensedAbstract;
+        try {
+            condensedAbstract = difyService.getCondensedAbstract(digest);
+        } catch (Exception e) {
+            // the article is still returned, with an empty digest, when condensing fails
+            log.warn("Failed to condense abstract for {}", baseUrl, e);
+            condensedAbstract = "";
+        }
+        GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
+        articleInfoDTO.setCategoryId(CATEGORY_ID);
+        articleInfoDTO.setSourceId(SOURCE_ID);
+        articleInfoDTO.setArticleUrl(baseUrl);
+        articleInfoDTO.setTitle(title);
+        articleInfoDTO.setPublicTime(date);
+        articleInfoDTO.setDigest(condensedAbstract);
+        return articleInfoDTO;
+    }
+
+    /**
+     * Decides whether a link looks like a news article link.
+     * <p>
+     * A link is rejected when its href is empty, a fragment or a javascript pseudo link, when its
+     * text is shorter than five characters, when it points to a .pdf/.doc/.docx/.xls file, when
+     * its path contains /login/, /register/, /help/ or /contact/, or when the list that contains
+     * it does not carry one of the expected CSS classes.
+     *
+     * @param link the anchor element
+     * @param href the raw href attribute of the anchor
+     * @param text the visible text of the anchor
+     * @return {@code true} when the link passes every rule
+     */
+    private static boolean isValidNewsLink(Element link, String href, String text) {
+        // basic validity: no empty links, fragments or javascript pseudo links
+        if (href.isEmpty() || href.startsWith("#") || href.startsWith("javascript:")) {
+            return false;
+        }
+
+        // very short link texts are treated as navigation rather than news
+        if (text.length() < 5) {
+            return false;
+        }
+
+        // document downloads are not web pages
+        String lowerHref = href.toLowerCase();
+        if (lowerHref.endsWith(".pdf") || lowerHref.endsWith(".doc")
+                || lowerHref.endsWith(".docx") || lowerHref.endsWith(".xls")) {
+            return false;
+        }
+
+        // known non-news path patterns
+        if (lowerHref.contains("/login/") || lowerHref.contains("/register/")
+                || lowerHref.contains("/help/") || lowerHref.contains("/contact/")) {
+            return false;
+        }
+
+        // DOM structure: the anchor must sit in a list item of a news list
+        Element parentLi = link.parent();
+        if (parentLi == null) {
+            return false;
+        }
+        Element parentUl = parentLi.parent();
+        if (parentUl == null) {
+            return false;
+        }
+
+        // NOTE(review): verify these class names against the live page markup
+        return parentUl.hasClass("list-news")
+                || parentUl.hasClass("guides")
+                || parentUl.hasClass("fee-list");
+    }
+}

Разница между файлами не показана из-за своего большого размера
+ 2 - 2
src/main/resources/application-dev.yml


+ 80 - 4
src/test/java/com/cslg/ppa/PpaApplicationTests.java

@@ -110,6 +110,8 @@ class PpaApplicationTests {
 
 //        getLocalInformationService.getLocalInformation();
 
+//        getPCTArticleService.getPCTArticle();
+
         getWeChatArticleService.getWeChatArticle();
 
 //        final String digest = getCNIPAArticleService.getDigest("");
@@ -248,7 +250,7 @@ class PpaApplicationTests {
     }
 
     @Test
-    void crawlArticles() throws IOException {
+    void crawlArticles() throws Exception {
         //-------------江苏知识产权局
 //        String url = "https://jsip.jiangsu.gov.cn/";
 //        String url = "https://jsip.jiangsu.gov.cn/col/col75877/index.html";
@@ -265,9 +267,57 @@ class PpaApplicationTests {
 //        String url = "https://zscqj.cq.gov.cn/";
         //-------------天津知识产权局
 //        String url = "https://zscq.tj.gov.cn/";
-
-        String url = "https://www.2firsts.cn/";
-        getProvinceNewsService.crawlArticles(url);
+        //-------------河南知识产权局
+//        String url = "https://scjg.henan.gov.cn/hnzscqj/";
+        //-------------广东省知识产权保护中心
+//        String url = "https://www.gippc.com.cn/ippc/index.shtml";
+        //-------------海南知识产权局
+//        String url = "https://amr.hainan.gov.cn/szscqj/";
+        //-------------湖南市场监督管理局
+//        String url = "https://amr.hunan.gov.cn/";
+        //-------------辽宁省知识产权局
+//        String url = "https://zscq.ln.gov.cn/zscq/index/index.shtml";
+        //-------------青海省知识产权局(青海省市场监督管理局)
+//        String url = "https://scjgj.qinghai.gov.cn/";
+        //-------------陕西省知识产权局
+//        String url = "https://snipa.shaanxi.gov.cn/";
+        //-------------山西省知识产权保护中心
+//        String url = "https://www.sxippc.com/";
+        //-------------云南知识产权局(云南省市场监督管理局)
+//        String url = "https://amr.yn.gov.cn/zscqj/index.htm";
+        //-------------内蒙古市场监督管理局
+//        String url = "https://amr.nmg.gov.cn/";
+        //-------------安徽省知识产权保护中心
+//        String url = "https://www.ahippc.cn/";
+        //-------------湖北知识产权局
+//        String url = "https://zscqj.hubei.gov.cn/";
+//        getProvinceNewsService.crawlHubeiArticles(url);
+        //-------------江西省市场监督管理局(知识产权局)
+//        String url = "https://amr.jiangxi.gov.cn/";
+        //-------------黑龙江知识产权局
+//        String url = "https://hlipa.hlj.gov.cn/hlipa/index.shtml";
+        //-------------福建省知识产权局(福建省市场监督管理局)
+//        String url = "https://scjgj.fujian.gov.cn/";
+        //-------------四川省知识产权局(四川省市场监督管理局)
+//        String url = "https://scjgj.sc.gov.cn/";
+        //-------------河北知识产权局(河北市场监督管理局)
+//        String url = "https://scjg.hebei.gov.cn/";
+        //-------------吉林省市场监督管理厅
+//        String url = "http://scjg.jl.gov.cn/";
+        //-------------山东省市场监督管理局
+//        String url = "http://amr.shandong.gov.cn/";
+        //-------------贵州省市场监督管理局(贵州知识产权局)
+//        String url = "https://amr.guizhou.gov.cn/";
+        //-------------甘肃市场监督管理局
+//        String url = "https://scjg.gansu.gov.cn/scjg/index.shtml";
+        //-------------广西知识产权公告服务平台
+//        String url = "http://www.gxipo.net/";
+        //-------------宁夏市场监督管理局
+//        String url = "http://scjg.nx.gov.cn/";
+        //-------------新疆市场监督管理局(新疆知识产权局)
+        String url = "https://scjgj.xinjiang.gov.cn/";
+        getProvinceNewsService.crawlArticles(url,"",1,2);
+//        String url = "https://www.ahippc.cn/news.html?categoryId=a5e96b641ade4fc9b50b4f9504ba0f62";
 //        final List<GetArticleInfoDTO> articleInfoDTOS = getProvinceNewsService.crawlArticlesDetail(url);
 //        System.out.println(articleInfoDTOS);
 //        String url = "https://jsip.jiangsu.gov.cn/art/2025/8/28/art_75877_11630402.html";
@@ -276,6 +326,19 @@ class PpaApplicationTests {
     }
 
     @Test
+    void addArticleFromWebSource() throws Exception {
+        // Manual check: calls getProvinceNewsService.addArticleFromWebSource(); no assertions are made.
+        // The commented-out lines below are alternative per-site calls (Hebei, Hubei, Jiangxi).
+        // NOTE(review): they pass a single argument, while the crawl methods shown in this commit
+        // take four parameters — they need updating before being re-enabled, unless overloads exist.
+//        String url = "https://scjg.hebei.gov.cn/";
+//        getProvinceNewsService.crawlHebeiArticles(url);
+//        String url = "https://scjg.hebei.gov.cn/node/919";
+//        getProvinceNewsService.addHebeiArticle(url);
+//        String url = "https://zscqj.hubei.gov.cn/";
+//        getProvinceNewsService.crawlHubeiArticles(url);
+//        getProvinceNewsService.crawlJiangxiArticles("https://amr.jiangxi.gov.cn/");
+        getProvinceNewsService.addArticleFromWebSource();
+
+    }
+
+    @Test
     void crawlEcigaretteArticles() throws IOException {
         String url = "https://www.2firsts.cn/";
         getEcigaretteService.crawlEcigaretteArticles(url);
@@ -317,6 +380,19 @@ class PpaApplicationTests {
     @Test
     void getPCTArticle() {
+        // Manual check: calls getPCTArticleService.getPCTArticle() directly; no assertions are made.
         getPCTArticleService.getPCTArticle();
+    }
+
+    @Test
+    void test111() throws Exception {
+        // Manual check: passes the ZhiNanZhen landing-page URL to getProvinceNewsService.test1;
+        // no assertions are made. The commented-out lines are alternative targets.
+        // China Intellectual Property Net (cnipr.com)
+//        String url = "http://www.cnipr.com/";
+//        getProvinceNewsService.test(url);
+        // ZhiNanZhen (worldip.cn)
+        String url = "https://www.worldip.cn/";
+        getProvinceNewsService.test1(url);
+
+//        String url = "https://www.worldip.cn/index.php?m=content&c=index&a=show&catid=64&id=2996";
+//        getProvinceNewsService.getDigest2(url);
+
     }
 }