zero пре 1 недеља
родитељ
комит
8760aad0e3

+ 7 - 7
src/main/java/com/cslg/ppa/service/GetWebArticle/GetCNIPAArticleService.java

@@ -1,6 +1,7 @@
 package com.cslg.ppa.service.GetWebArticle;
 
 
+import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
 import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
 import com.cslg.ppa.common.utils.DateUtil;
 import com.cslg.ppa.dto.GetArticleInfoDTO;
@@ -46,7 +47,7 @@ public class GetCNIPAArticleService {
     /**
      * 国家知识产权局
      */
-    @Scheduled(cron = "0 0 2 * * ?")
+    @Scheduled(cron = "0 0 1 * * ?")
     @Transactional(propagation = Propagation.REQUIRED,rollbackFor = Throwable.class)
     public void getCNIPA(){
         System.out.println(new Date() + "CNIPA-Begin");
@@ -95,7 +96,7 @@ public class GetCNIPAArticleService {
                 String dateStr = document.getElementsByTag("span").text().trim();
                 String yesterdayDateStr = DateUtil.getYesterdayDateStr();
                 if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
-                    continue;
+                    break;
                 }
                 Date date = new Date();
                 SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
@@ -109,8 +110,9 @@ public class GetCNIPAArticleService {
                 String title = linkElement.text();
 
                 // 检查文章是否已存在
-                ArticleInfo articleInfo = articleInfoService.getOne(new QueryWrapper<ArticleInfo>().lambda().eq(ArticleInfo::getTitle, title));
-                if (ObjectUtils.isNotEmpty(articleInfo)) {
+                long count = articleInfoService.count(new LambdaQueryWrapper<ArticleInfo>()
+                        .eq(ArticleInfo::getTitle, title));
+                if (count > 0) {
                     continue; // 文章已存在,跳过
                 }
 
@@ -124,11 +126,9 @@ public class GetCNIPAArticleService {
                     continue;
                 }
                 String condensedAbstract = null;
-//                String pctCondensedAbstract = null;
                 try {
                     condensedAbstract = difyService.getCondensedAbstract(digest);
-//                    pctCondensedAbstract = difyService.getPctCondensedAbstract(digest);
-                } catch (Exception e) {
+                } catch (Exception ignored) {
 
                 }
                 if (StringUtils.isEmpty(condensedAbstract)) {

+ 75 - 11
src/main/java/com/cslg/ppa/service/GetWebArticle/GetCNIPRArticleService.java

@@ -1,10 +1,13 @@
 package com.cslg.ppa.service.GetWebArticle;
 
+import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
 import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
 import com.cslg.ppa.common.utils.DateUtil;
 import com.cslg.ppa.common.utils.RegexUtil;
 import com.cslg.ppa.dto.GetArticleInfoDTO;
 import com.cslg.ppa.entity.ArticleInfo;
+import com.cslg.ppa.entity.SourceInfo;
+import com.cslg.ppa.mapper.SourceInfoMapper;
 import com.cslg.ppa.service.ArticleInfoService;
 import com.cslg.ppa.service.commom.DifyService;
 import lombok.RequiredArgsConstructor;
@@ -16,6 +19,7 @@ import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
 import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.scheduling.annotation.Scheduled;
 import org.springframework.stereotype.Service;
 import org.springframework.transaction.annotation.Propagation;
 import org.springframework.transaction.annotation.Transactional;
@@ -26,6 +30,7 @@ import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.List;
+import java.util.stream.Collectors;
 
 @Slf4j
 @Service
@@ -34,16 +39,72 @@ public class GetCNIPRArticleService {
     private final ArticleInfoService articleInfoService;
     @Autowired
     private DifyService difyService;
+    @Autowired
+    private SourceInfoMapper sourceInfoMapper;
+
+    private static final String LINK_SELECTOR = "a[href]";
+    private static final String TITLE_SELECTOR = "a";
 
     /***
      * 中国知识产权网
-     * @param baseUrl
      * @throws IOException
      */
-    //    @Scheduled(cron = "0 0 2 * * ?")
-    @Transactional(propagation = Propagation.REQUIRED,rollbackFor = Throwable.class)
-    public void crawCniprArticle(String baseUrl) throws IOException {
+    @Scheduled(cron = "0 0 4 * * ?")
+    @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Throwable.class)
+    public void crawCniprArticle() throws IOException {
+        // Scheduled entry point: loads the SourceInfo row with webType = 3, collects the links
+        // whose text is "通知公告" or "新闻资讯" from its page, and crawls each distinct link
+        // through crawCniprArticleDetail with category id 6.
         System.out.println(new Date() + "CNIPR-Begin");
+        SourceInfo sourceInfo = sourceInfoMapper.selectOne(new LambdaQueryWrapper<SourceInfo>()
+                .eq(SourceInfo::getWebType, 3));
+        if (ObjectUtils.isNotEmpty(sourceInfo)) {
+            String baseUrl = sourceInfo.getSourceUrl();
+            Integer sourceInfoId = sourceInfo.getId();
+            List<String> list = new ArrayList<>();
+            try {
+                Thread.sleep(1000);
+                // Fetch and parse the source page with Jsoup.
+                Document doc = Jsoup.connect(baseUrl)
+                        .timeout(20000) // generous timeout (ms)
+                        .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36")
+                        .followRedirects(true) // follow redirects
+                        .get();
+
+                // Keep only the anchors whose text is one of the two column names.
+                Elements newsItems = doc.select(LINK_SELECTOR);
+                for (Element item : newsItems) {
+                    try {
+                        Element titleElement = item.selectFirst(TITLE_SELECTOR);
+                        Element linkElement = item.selectFirst(LINK_SELECTOR);
+
+                        if (titleElement != null && linkElement != null) {
+                            String title = titleElement.text().trim();
+                            if (StringUtils.equals("通知公告", title) || StringUtils.equals("新闻资讯", title)) {
+                                String link = linkElement.absUrl("href"); // absolute URL
+                                list.add(link);
+                            }
+                        }
+                    } catch (Exception e) {
+                        log.warn("解析单个新闻项时出错: ", e);
+                    }
+                }
+            } catch (InterruptedException e) {
+                // Restore the interrupt flag instead of silently dropping the interruption.
+                Thread.currentThread().interrupt();
+                log.warn("CNIPR crawl interrupted before fetching {}", baseUrl, e);
+            } catch (Exception e) {
+                // Still best effort, but the failure is now visible in the log.
+                log.warn("Failed to fetch CNIPR column list from {}", baseUrl, e);
+            }
+
+            if (!CollectionUtils.isEmpty(list)) {
+                List<String> collect = list.stream().distinct().collect(Collectors.toList());
+                for (String url : collect) {
+                    try {
+                        this.crawCniprArticleDetail(url, sourceInfoId, 6);
+                    } catch (IOException e) {
+                        // One unreachable column must not stop the remaining ones.
+                        log.warn("Failed to crawl CNIPR column {}", url, e);
+                    }
+                }
+            }
+        }
+        System.out.println(new Date() + "CNIPR-End");
+    }
+
+    public void crawCniprArticleDetail(String baseUrl,Integer sourceId, Integer type) throws IOException {
         // 使用Jsoup连接并解析网页
         Document doc = Jsoup.connect(baseUrl)
                 .timeout(20000) // 增加超时时间
@@ -57,6 +118,9 @@ public class GetCNIPRArticleService {
         List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
         for (Element linkElement : linkElements) {
             Element element = linkElement.getElementsByTag("a").first();
+            if (element == null) {
+                continue;
+            }
             String link = element.absUrl("href");
             // 如果absUrl没有返回有效链接,尝试其他方式
             if (StringUtils.isEmpty(link)) {
@@ -75,7 +139,7 @@ public class GetCNIPRArticleService {
             }
 
             try {
-                GetArticleInfoDTO articleInfoDTO = this.addCniprArticle(link);
+                GetArticleInfoDTO articleInfoDTO = this.addCniprArticle(link, sourceId, type);
                 if (ObjectUtils.isNotEmpty(articleInfoDTO)) {
                     articleInfoDTOS.add(articleInfoDTO);
                 }
@@ -83,10 +147,9 @@ public class GetCNIPRArticleService {
             }
         }
         articleInfoService.batchAddArticleInfo(articleInfoDTOS);
-        System.out.println(new Date() + "CNIPR-Begin");
     }
 
-    public GetArticleInfoDTO addCniprArticle(String baseUrl) throws IOException {
+    public GetArticleInfoDTO addCniprArticle(String baseUrl, Integer sourceId, Integer type) throws IOException {
         // 使用Jsoup连接并解析网页
         Document doc = Jsoup.connect(baseUrl)
                 .timeout(15000) // 增加超时时间
@@ -113,8 +176,9 @@ public class GetCNIPRArticleService {
         Element titleElement = initElement.selectFirst("div.xq_cont_title");
         String title = titleElement.select("p").first().text();
         // 检查文章是否已存在
-        ArticleInfo articleInfo = articleInfoService.getOne(new QueryWrapper<ArticleInfo>().lambda().eq(ArticleInfo::getTitle, title));
-        if (ObjectUtils.isNotEmpty(articleInfo)) {
+        long count = articleInfoService.count(new LambdaQueryWrapper<ArticleInfo>()
+                .eq(ArticleInfo::getTitle, title));
+        if (count > 0) {
             return null;
         }
 
@@ -142,8 +206,8 @@ public class GetCNIPRArticleService {
             condensedAbstract = "";
         }
         GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
-        articleInfoDTO.setCategoryId(6);
-        articleInfoDTO.setSourceId(38);
+        articleInfoDTO.setCategoryId(type);
+        articleInfoDTO.setSourceId(sourceId);
         articleInfoDTO.setArticleUrl(baseUrl);
         articleInfoDTO.setTitle(title);
         articleInfoDTO.setPublicTime(date);

+ 211 - 0
src/main/java/com/cslg/ppa/service/GetWebArticle/GetIprDailyArticleService.java

@@ -0,0 +1,211 @@
+package com.cslg.ppa.service.GetWebArticle;
+
+import com.alibaba.fastjson.JSONObject;
+import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
+import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
+import com.cslg.ppa.common.utils.DateUtil;
+import com.cslg.ppa.dto.GetArticleInfoDTO;
+import com.cslg.ppa.entity.ArticleInfo;
+import com.cslg.ppa.entity.SourceInfo;
+import com.cslg.ppa.mapper.SourceInfoMapper;
+import com.cslg.ppa.service.ArticleInfoService;
+import com.cslg.ppa.service.commom.DifyService;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import okhttp3.OkHttpClient;
+import okhttp3.Request;
+import org.apache.commons.lang3.ObjectUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.http.HttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.http.util.EntityUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.scheduling.annotation.Scheduled;
+import org.springframework.stereotype.Service;
+import org.springframework.transaction.annotation.Propagation;
+import org.springframework.transaction.annotation.Transactional;
+import org.springframework.util.CollectionUtils;
+
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.*;
+import java.util.concurrent.TimeUnit;
+
+@Slf4j
+@Service
+@RequiredArgsConstructor
+public class GetIprDailyArticleService {
+    private final ArticleInfoService articleInfoService;
+    @Autowired
+    private DifyService difyService;
+    @Autowired
+    private SourceInfoMapper sourceInfoMapper;
+
+    private static final String BaseUrl = "http://www.iprdaily.cn/";
+    private static final Integer BaseId = 40;
+
+    /**
+     * Crawls the IPRDaily Chinese site: requests each configured list-API URL, reads the HTML
+     * fragment from the {@code msg} field of the JSON response and passes it to
+     * {@link #addIprDailyArticle(String, String, Integer, Integer)} together with the
+     * category id mapped to that URL.
+     * <p>
+     * An {@link IOException} on one URL, or a response without {@code msg}, is logged and the
+     * remaining URLs are still processed.
+     *
+     * @throws Exception if a non-I/O failure occurs while processing a response
+     */
+//    @Scheduled(cron = "0 15 0 * * ?")
+    @Transactional(propagation = Propagation.REQUIRED,rollbackFor = Throwable.class)
+    public void crawIprDailyArticle() throws Exception {
+        System.out.println(new Date() + "IprDaily-Begin");
+        // list-API URL -> category id handed to addIprDailyArticle
+        Map<String, Integer> urlMap = new HashMap<>();
+        // cases
+        urlMap.put("http://www.iprdaily.cn/index.php?s=api&c=api&m=template&name=get_news.html&catid=9", 3);
+        // industry ---- litigation
+        urlMap.put("http://www.iprdaily.cn/index.php?s=api&c=api&m=template&name=get_news.html&catid=11", 5);
+        // industry ---- international perspective
+        urlMap.put("http://www.iprdaily.cn/index.php?s=api&c=api&m=template&name=get_news.html&catid=15", 5);
+        // industry ---- industry
+        urlMap.put("http://www.iprdaily.cn/index.php?s=api&c=api&m=template&name=get_news.html&catid=8", 5);
+
+        // One client for all requests: building a client per URL wastes its connection pool.
+        OkHttpClient okHttpClient = new OkHttpClient.Builder()
+                .connectTimeout(60, TimeUnit.SECONDS)
+                .writeTimeout(60, TimeUnit.SECONDS)
+                .readTimeout(60, TimeUnit.SECONDS)
+                .build();
+        for (Map.Entry<String, Integer> entry : urlMap.entrySet()) {
+            String listUrl = entry.getKey();
+            Integer type = entry.getValue();
+            try {
+                Request request = new Request.Builder()
+                        .url(listUrl)
+                        .get()
+                        .build();
+                String res = Objects.requireNonNull(okHttpClient.newCall(request).execute().body()).string();
+                JSONObject parseObject = JSONObject.parseObject(res);
+                String htmlContent = parseObject == null ? null : parseObject.getString("msg");
+                if (StringUtils.isEmpty(htmlContent)) {
+                    // Without this guard Jsoup.parse(null) would throw and abort the other categories.
+                    log.warn("IPRDaily list response has no 'msg' content: {}", listUrl);
+                    continue;
+                }
+                this.addIprDailyArticle(htmlContent, BaseUrl, BaseId, type);
+            } catch (IOException e) {
+                log.error("Get IPRDaily Web Article Error: {}", listUrl, e);
+            }
+        }
+        System.out.println(new Date() + "IprDaily-End");
+    }
+
+    /**
+     * Extracts yesterday's articles from one IPRDaily list fragment and stores them in a batch.
+     * <p>
+     * For every {@code li.box-list} item: stops at the first item whose date is not yesterday,
+     * skips titles already present in the article table, resolves the article link against
+     * {@code baseUrl}, fetches the article text, condenses it through {@code difyService} and
+     * collects the result. Items that are malformed or fail any step are logged and skipped.
+     *
+     * @param htmlContent HTML fragment containing the {@code li.box-list} items
+     * @param baseUrl     site URL used to turn relative links into absolute ones
+     * @param sourceId    source id stored on every created article
+     * @param type        category id stored on every created article
+     * @throws IOException declared for callers; per-item failures are handled inside the loop
+     */
+    public void addIprDailyArticle(String htmlContent, String baseUrl,Integer sourceId, Integer type) throws IOException {
+        Document doc = Jsoup.parse(htmlContent);
+        Elements liElements = doc.select("li.box-list");
+        List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
+        // Loop invariants, computed once instead of once per item.
+        String yesterdayDateStr = DateUtil.getYesterdayDateStr();
+        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
+        for (Element liElement : liElements) {
+            // Publication date of the item
+            Element timeElement = liElement.selectFirst("dd.time");
+            if (timeElement == null) {
+                log.warn("IPRDaily list item without dd.time, skipped");
+                continue;
+            }
+            String dateStr = timeElement.text();
+            if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
+                // Stop scanning at the first item that is not dated yesterday.
+                break;
+            }
+            Date date;
+            try {
+                date = dateFormat.parse(dateStr);
+            } catch (Exception e) {
+                continue;
+            }
+
+            // First title of the item
+            Element titleElement = liElement.selectFirst("dt.title");
+            Element imgBox = liElement.selectFirst("a.img-box");
+            if (titleElement == null || imgBox == null) {
+                // A malformed item must not discard the articles already collected.
+                log.warn("IPRDaily list item without dt.title or a.img-box, skipped");
+                continue;
+            }
+            String title = titleElement.text();
+            // Skip articles that already exist
+            long count = articleInfoService.count(new LambdaQueryWrapper<ArticleInfo>()
+                    .eq(ArticleInfo::getTitle, title));
+            if (count > 0) {
+                continue;
+            }
+
+            String link = imgBox.attr("href");
+            if (StringUtils.isEmpty(link)) {
+                // An empty href would resolve to the site root, which is not an article page.
+                continue;
+            }
+            if (link.startsWith("/")) {
+                // Root-relative link: prepend scheme and host of baseUrl
+                int endIndex = baseUrl.indexOf("/", 8); // skip past http:// or https://
+                String baseDomain = endIndex > 0 ? baseUrl.substring(0, endIndex) : baseUrl;
+                link = baseDomain + link;
+            } else if (!link.startsWith("http")) {
+                // Relative link: append to the directory part of baseUrl
+                int lastSlash = baseUrl.lastIndexOf("/");
+                String basePath = lastSlash > 0 ? baseUrl.substring(0, lastSlash + 1) : baseUrl + "/";
+                link = basePath + link;
+            }
+            String digest;
+            try {
+                digest = this.getDigest(link);
+            } catch (Exception e) {
+                log.warn("Failed to fetch IPRDaily article {}", link, e);
+                continue;
+            }
+            if (StringUtils.isEmpty(digest)) {
+                continue;
+            }
+            String condensedAbstract;
+            try {
+                condensedAbstract = difyService.getCondensedAbstract(digest);
+            } catch (Exception e) {
+                log.warn("Failed to condense IPRDaily article {}", link, e);
+                continue;
+            }
+            if (StringUtils.isEmpty(condensedAbstract)) {
+                continue;
+            }
+            GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
+            articleInfoDTO.setCategoryId(type);
+            articleInfoDTO.setSourceId(sourceId);
+            articleInfoDTO.setArticleUrl(link);
+            articleInfoDTO.setTitle(title);
+            articleInfoDTO.setPublicTime(date);
+            articleInfoDTO.setDigest(condensedAbstract);
+            articleInfoDTOS.add(articleInfoDTO);
+        }
+        articleInfoService.batchAddArticleInfo(articleInfoDTOS);
+    }
+
+    /**
+     * Downloads the page at {@code url} and returns the article text extracted by
+     * {@link #readJson(String)}.
+     *
+     * @param url absolute URL of the article page
+     * @return the extracted text, or an empty string when the response has no body
+     * @throws IOException if the request or reading the response fails
+     */
+    public String getDigest(String url) throws IOException {
+        String responseBody;
+        // try-with-resources: the client is closed even when execute() or toString() throws.
+        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
+            HttpResponse response = httpClient.execute(new HttpGet(url));
+            if (response.getEntity() == null) {
+                // EntityUtils.toString rejects a null entity.
+                return "";
+            }
+            // NOTE(review): without a charset in the response headers EntityUtils falls back to
+            // ISO-8859-1; confirm the site always declares its charset.
+            responseBody = EntityUtils.toString(response.getEntity());
+        }
+        return responseBody != null ? this.readJson(responseBody) : "";
+    }
+
+    /**
+     * Extracts the article text from an HTML page (despite the name, the input is HTML, not JSON).
+     * <p>
+     * Joins the non-empty {@code <p>} texts inside {@code dl.article-con} with newlines. When the
+     * container has no non-empty paragraph, the container's own text is returned instead.
+     *
+     * @param responseBody HTML of the article page
+     * @return the article text, or an empty string when the input is empty or the container is missing
+     */
+    public String readJson(String responseBody) {
+        if (StringUtils.isEmpty(responseBody)) {
+            return "";
+        }
+        // Parse the HTML with Jsoup
+        Document doc = Jsoup.parse(responseBody);
+        Element articleBody = doc.selectFirst("dl.article-con");
+        if (articleBody == null) {
+            // Explicit guard replaces relying on a caught NullPointerException.
+            return "";
+        }
+        List<String> paragraphs = new ArrayList<>();
+        for (Element paragraph : articleBody.select("p")) {
+            String text = paragraph.text();
+            if (StringUtils.isNotEmpty(text)) {
+                paragraphs.add(text);
+            }
+        }
+        if (CollectionUtils.isEmpty(paragraphs)) {
+            // The previous fallback re-read the same (empty) paragraphs and always produced "";
+            // use the container's text so articles not wrapped in <p> still yield content.
+            return articleBody.text();
+        }
+        return StringUtils.join(paragraphs, "\n");
+    }
+}

+ 1 - 1
src/main/java/com/cslg/ppa/service/GetWebArticle/GetLocalInformationService.java

@@ -22,7 +22,7 @@ import java.util.*;
 public class GetLocalInformationService {
     private final GetCNIPAArticleService getCNIPAArticleService;
 
-    @Scheduled(cron = "0 0 3 * * ?")
+    @Scheduled(cron = "0 0 2 * * ?")
     @Transactional(propagation = Propagation.REQUIRED,rollbackFor = Throwable.class)
     public void getLocalInformation(){
         System.out.println(new Date() + "Local-Begin");

+ 1 - 1
src/main/java/com/cslg/ppa/service/GetWebArticle/GetPCTArticleService.java

@@ -23,7 +23,7 @@ import java.util.Map;
 public class GetPCTArticleService {
     private final GetCNIPAArticleService getCNIPAArticleService;
 
-    @Scheduled(cron = "0 30 2 * * ?")
+    @Scheduled(cron = "0 30 1 * * ?")
     @Transactional(propagation = Propagation.REQUIRED,rollbackFor = Throwable.class)
     public void getPCTArticle(){
         System.out.println(new Date() + "PCT-Begin");

+ 15 - 14
src/main/java/com/cslg/ppa/service/GetWebArticle/GetProvinceNewsService.java

@@ -58,7 +58,6 @@ public class GetProvinceNewsService {
     @Autowired
     private SourceInfoMapper sourceInfoMapper;
 
-
     private static List<String> TitleList = Arrays.asList("要闻动态", "通知公告", "公示公告", "公告");
     // 新闻动态项的CSS选择器 (通用配置,适用于大部分政府网站)
     private static final String NEWS_CONTAINER_SELECTOR = "div.con-right-list,div.main-content-right,div.tab-content,div.subpageCon-con,div.gl-main,div.nymain," +
@@ -80,6 +79,7 @@ public class GetProvinceNewsService {
                 .eq(SourceInfo::getSourceType, 1)
                 .eq(SourceInfo::getWebType, 2));
         if (!CollectionUtils.isEmpty(sourceInfos)) {
+            int sum = 0;
             for (SourceInfo sourceInfo : sourceInfos) {
                 try {
                     Thread.sleep(1000);
@@ -114,7 +114,9 @@ public class GetProvinceNewsService {
                         log.warn("解析" + sourceName + "地区新闻项时出错: ", e);
                     }
                 }
+                sum++;
             }
+            System.out.println("Finished Num" + sum + "Time");
         }
         System.out.println(new Date() + "Province-Begin");
     }
@@ -262,9 +264,9 @@ public class GetProvinceNewsService {
                 dateStr = getHainanDate(dateStr);
             }
             String yesterdayDateStr = DateUtil.getYesterdayDateStr();
-            if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
-                break;
-            }
+//            if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
+//                break;
+//            }
             Date date = new Date();
             SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
             try {
@@ -435,9 +437,9 @@ public class GetProvinceNewsService {
                         String docreltime = articleDTO.getDOCRELTIME();
                         String dateStr = RegexUtil.extractDate(docreltime);
                         String yesterdayDateStr = DateUtil.getYesterdayDateStr();
-                        if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
-                            break;
-                        }
+//                        if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
+//                            break;
+//                        }
                         Date date = new Date();
                         SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
                         try {
@@ -559,9 +561,9 @@ public class GetProvinceNewsService {
             Element element = linkElement.select("p.zkmmr_tl1_item_date").first();
             String dateStr = element.text().trim();
             String yesterdayDateStr = DateUtil.getYesterdayDateStr();
-            if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
-                break;
-            }
+//            if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
+//                break;
+//            }
             Date date = new Date();
             SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
             try {
@@ -581,7 +583,6 @@ public class GetProvinceNewsService {
                 continue; // 文章已存在,跳过
             }
 
-
             String link = linkElement.absUrl("href");
             String digest = null;
             try {
@@ -652,9 +653,9 @@ public class GetProvinceNewsService {
                 String pubDate = source.get("pubDate").toString();
                 String dateStr = RegexUtil.extractDate(pubDate);
                 String yesterdayDateStr = DateUtil.getYesterdayDateStr();
-                if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
-                    break;
-                }
+//                if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
+//                    break;
+//                }
                 Date date = new Date();
                 SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
                 try {

+ 5 - 4
src/main/java/com/cslg/ppa/service/GetWebArticle/GetWeChatArticleService.java

@@ -70,7 +70,7 @@ public class GetWeChatArticleService {
     @Autowired
     private FileManagerService fileManagerService;
 
-    @Scheduled(cron = "0 0 4 * * ?")
+    @Scheduled(cron = "0 0 3 * * ?")
     @Transactional(propagation = Propagation.REQUIRED,rollbackFor = Throwable.class)
     public void getWeChatArticle() throws Exception {
         System.out.println(new Date() + "Wechat-Begin");
@@ -88,7 +88,7 @@ public class GetWeChatArticleService {
                 String createTimeStr = DateUtil.convertTimestamp(secondCreateTime);
                 String yesterdayDateStr = DateUtil.getYesterdayDateStr();
                 if (!StringUtils.equals(createTimeStr, yesterdayDateStr)) {
-                    continue;
+                    break;
                 }
                 Date createTime = new Date();
                 SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
@@ -103,8 +103,9 @@ public class GetWeChatArticleService {
 
 
                 // 检查文章是否已存在
-                ArticleInfo articleInfo = articleInfoService.getOne(new QueryWrapper<ArticleInfo>().lambda().eq(ArticleInfo::getTitle, title));
-                if (ObjectUtils.isNotEmpty(articleInfo)) {
+                long sum = articleInfoService.count(new LambdaQueryWrapper<ArticleInfo>()
+                        .eq(ArticleInfo::getTitle, title));
+                if (sum > 0) {
                     continue; // 文章已存在,跳过
                 }
 

+ 35 - 22
src/main/java/com/cslg/ppa/service/GetWebArticle/GetZhiNanZhenArticleService.java

@@ -1,10 +1,13 @@
 package com.cslg.ppa.service.GetWebArticle;
 
+import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
 import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
 import com.cslg.ppa.common.utils.DateUtil;
 import com.cslg.ppa.common.utils.RegexUtil;
 import com.cslg.ppa.dto.GetArticleInfoDTO;
 import com.cslg.ppa.entity.ArticleInfo;
+import com.cslg.ppa.entity.SourceInfo;
+import com.cslg.ppa.mapper.SourceInfoMapper;
 import com.cslg.ppa.service.ArticleInfoService;
 import com.cslg.ppa.service.commom.DifyService;
 import lombok.RequiredArgsConstructor;
@@ -16,6 +19,7 @@ import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
 import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.scheduling.annotation.Scheduled;
 import org.springframework.stereotype.Service;
 import org.springframework.transaction.annotation.Propagation;
 import org.springframework.transaction.annotation.Transactional;
@@ -36,32 +40,39 @@ public class GetZhiNanZhenArticleService {
     private final ArticleInfoService articleInfoService;
     @Autowired
     private DifyService difyService;
+    @Autowired
+    private SourceInfoMapper sourceInfoMapper;
 
     /**
      * 智南针
      *
-     * @param baseUrl
      * @throws Exception
      */
-    //    @Scheduled(cron = "0 0 2 * * ?")
-    @Transactional(propagation = Propagation.REQUIRED,rollbackFor = Throwable.class)
-    public void crawZhiNanZhenArticle(String baseUrl) throws Exception {
+    @Scheduled(cron = "0 30 4 * * ?")
+    @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Throwable.class)
+    public void crawZhiNanZhenArticle() throws Exception {
         System.out.println(new Date() + "ZhiNanZhen-Begin");
-        // 使用Jsoup连接并解析网页
-        Document doc = Jsoup.connect(baseUrl)
-                .timeout(20000) // 增加超时时间
-                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36")
-                .followRedirects(true) // 跟随重定向
-                .get();
-        List<String> list = Arrays.asList("海外知识产权动态信息", "海外知识产权实务指引", "海外知识产权官费专栏");
-        // 遍历每个栏目并抓取资讯链接
-        for (String sectionTitle : list) {
-            extractSectionLinks(doc, sectionTitle);
+        // The crawl target is the SourceInfo row with webType = 4; nothing is crawled when the lookup is empty.
+        SourceInfo sourceInfo = sourceInfoMapper.selectOne(new LambdaQueryWrapper<SourceInfo>()
+                .eq(SourceInfo::getWebType, 4));
+        if (ObjectUtils.isNotEmpty(sourceInfo)) {
+            String sourceUrl = sourceInfo.getSourceUrl();
+            Integer sourceInfoId = sourceInfo.getId();
+            // Fetch and parse the page with Jsoup
+            Document doc = Jsoup.connect(sourceUrl)
+                    .timeout(20000) // longer timeout (ms)
+                    .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36")
+                    .followRedirects(true) // follow redirects
+                    .get();
+            List<String> list = Arrays.asList("海外知识产权动态信息", "海外知识产权实务指引", "海外知识产权官费专栏");
+            // Walk each column title and crawl its article links; 11 is passed on as the category id
+            for (String sectionTitle : list) {
+                this.extractSectionLinks(doc, sectionTitle, sourceInfoId, 11);
+            }
         }
         System.out.println(new Date() + "ZhiNanZhen-End");
     }
 
-    private void extractSectionLinks(Document doc, String sectionTitle) throws Exception {
+    private void extractSectionLinks(Document doc, String sectionTitle, Integer sourceId, Integer type) throws Exception {
         List<String> links = new ArrayList<>();
         // 选择所有section元素
         Elements sections = doc.select("section.section");
@@ -74,7 +85,7 @@ public class GetZhiNanZhenArticleService {
                 for (Element item : newsItems) {
                     String href = item.attr("href");
                     // 确保href属性存在且不是空值,并且不是非资讯链接
-                    if (!href.isEmpty() && !isValidNewsLink(item,href,item.text())) {
+                    if (!href.isEmpty() && !isValidNewsLink(item, href, item.text())) {
                         links.add(href);
                     }
                 }
@@ -85,7 +96,7 @@ public class GetZhiNanZhenArticleService {
         if (!CollectionUtils.isEmpty(links)) {
             for (String link : links) {
                 try {
-                    GetArticleInfoDTO articleInfoDTO = this.addZhiNanZhenArticle(link);
+                    GetArticleInfoDTO articleInfoDTO = this.addZhiNanZhenArticle(link, sourceId, type);
                     if (ObjectUtils.isNotEmpty(articleInfoDTO)) {
                         articleInfoDTOS.add(articleInfoDTO);
                     }
@@ -98,11 +109,12 @@ public class GetZhiNanZhenArticleService {
 
     /**
      * 添加智南针网资讯内容到资讯表中
+     *
      * @param baseUrl
      * @return
      * @throws IOException
      */
-    public GetArticleInfoDTO addZhiNanZhenArticle(String baseUrl) throws IOException {
+    public GetArticleInfoDTO addZhiNanZhenArticle(String baseUrl, Integer sourceId, Integer type) throws IOException {
         // 使用Jsoup连接并解析网页
         Document doc = Jsoup.connect(baseUrl)
                 .timeout(15000) // 增加超时时间
@@ -130,8 +142,9 @@ public class GetZhiNanZhenArticleService {
         String titleContent = titleElement.text();
         String title = titleContent.substring(0, (titleContent.length() - 23)).trim();
         // 检查文章是否已存在
-        ArticleInfo articleInfo = articleInfoService.getOne(new QueryWrapper<ArticleInfo>().lambda().eq(ArticleInfo::getTitle, title));
-        if (ObjectUtils.isNotEmpty(articleInfo)) {
+        long sum = articleInfoService.count(new LambdaQueryWrapper<ArticleInfo>()
+                .eq(ArticleInfo::getTitle, title));
+        if (sum > 0) {
             return null;
         }
 
@@ -160,8 +173,8 @@ public class GetZhiNanZhenArticleService {
             condensedAbstract = "";
         }
         GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
-        articleInfoDTO.setCategoryId(6);
-        articleInfoDTO.setSourceId(39);
+        articleInfoDTO.setCategoryId(type);
+        articleInfoDTO.setSourceId(sourceId);
         articleInfoDTO.setArticleUrl(baseUrl);
         articleInfoDTO.setTitle(title);
         articleInfoDTO.setPublicTime(date);

Разлика између датотека није приказана због своје велике величине
+ 2 - 2
src/main/resources/application-dev.yml


+ 22 - 10
src/test/java/com/cslg/ppa/PpaApplicationTests.java

@@ -89,6 +89,12 @@ class PpaApplicationTests {
     private FileManagerService fileManagerService;
     @Autowired
     private GetPCTArticleService getPCTArticleService;
+    @Autowired
+    private GetIprDailyArticleService getIprDailyArticleService;
+    @Autowired
+    private GetZhiNanZhenArticleService getZhiNanZhenArticleService;
+    @Autowired
+    private GetCNIPRArticleService getCNIPRArticleService;
 
     @Test
     void contextLoads() {
@@ -268,7 +274,7 @@ class PpaApplicationTests {
         //-------------天津知识产权局
 //        String url = "https://zscq.tj.gov.cn/";
         //-------------河南知识产权局
-//        String url = "https://scjg.henan.gov.cn/hnzscqj/";
+        String url = "https://scjg.henan.gov.cn/hnzscqj/";
         //-------------广东省知识产权保护中心
 //        String url = "https://www.gippc.com.cn/ippc/index.shtml";
         //-------------海南知识产权局
@@ -315,7 +321,7 @@ class PpaApplicationTests {
         //-------------宁夏市场监督管理局
 //        String url = "http://scjg.nx.gov.cn/";
         //-------------新疆市场监督管理局(新疆知识产权局)
-        String url = "https://scjgj.xinjiang.gov.cn/";
+//        String url = "https://scjgj.xinjiang.gov.cn/";
         getProvinceNewsService.crawlArticles(url,"",1,2);
 //        String url = "https://www.ahippc.cn/news.html?categoryId=a5e96b641ade4fc9b50b4f9504ba0f62";
 //        final List<GetArticleInfoDTO> articleInfoDTOS = getProvinceNewsService.crawlArticlesDetail(url);
@@ -327,13 +333,6 @@ class PpaApplicationTests {
 
     @Test
     void addArticleFromWebSource() throws Exception {
-//        String url = "https://scjg.hebei.gov.cn/";
-//        getProvinceNewsService.crawlHebeiArticles(url);
-//        String url = "https://scjg.hebei.gov.cn/node/919";
-//        getProvinceNewsService.addHebeiArticle(url);
-//        String url = "https://zscqj.hubei.gov.cn/";
-//        getProvinceNewsService.crawlHubeiArticles(url);
-//        getProvinceNewsService.crawlJiangxiArticles("https://amr.jiangxi.gov.cn/");
         getProvinceNewsService.addArticleFromWebSource();
 
     }
@@ -383,7 +382,17 @@ class PpaApplicationTests {
     }
 
     @Test
-    void test111() throws Exception {
+    void crawZhiNanZhenArticle() throws Exception {
+        getZhiNanZhenArticleService.crawZhiNanZhenArticle();
+    }
+
+    @Test
+    void crawCniprArticle() throws Exception {
+        getCNIPRArticleService.crawCniprArticle();
+    }
+
+    @Test
+    void crawIprDailyArticle() throws Exception {
         //-------------中国知识产权网
 //        String url = "http://www.cnipr.com/";
 //        getProvinceNewsService.test(url);
@@ -394,5 +403,8 @@ class PpaApplicationTests {
 //        String url = "https://www.worldip.cn/index.php?m=content&c=index&a=show&catid=64&id=2996";
 //        getProvinceNewsService.getDigest2(url);
 
+        getIprDailyArticleService.crawIprDailyArticle();
+
+//        getIprDailyArticleService.getDigest("http://www.iprdaily.cn/news_40675.html");
     }
 }