|
@@ -0,0 +1,211 @@
|
|
|
+package com.cslg.ppa.service.GetWebArticle;
|
|
|
+
|
|
|
+import com.alibaba.fastjson.JSONObject;
|
|
|
+import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
|
|
|
+import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
|
|
|
+import com.cslg.ppa.common.utils.DateUtil;
|
|
|
+import com.cslg.ppa.dto.GetArticleInfoDTO;
|
|
|
+import com.cslg.ppa.entity.ArticleInfo;
|
|
|
+import com.cslg.ppa.entity.SourceInfo;
|
|
|
+import com.cslg.ppa.mapper.SourceInfoMapper;
|
|
|
+import com.cslg.ppa.service.ArticleInfoService;
|
|
|
+import com.cslg.ppa.service.commom.DifyService;
|
|
|
+import lombok.RequiredArgsConstructor;
|
|
|
+import lombok.extern.slf4j.Slf4j;
|
|
|
+import okhttp3.OkHttpClient;
|
|
|
+import okhttp3.Request;
|
|
|
+import org.apache.commons.lang3.ObjectUtils;
|
|
|
+import org.apache.commons.lang3.StringUtils;
|
|
|
+import org.apache.http.HttpResponse;
|
|
|
+import org.apache.http.client.methods.HttpGet;
|
|
|
+import org.apache.http.impl.client.CloseableHttpClient;
|
|
|
+import org.apache.http.impl.client.HttpClients;
|
|
|
+import org.apache.http.util.EntityUtils;
|
|
|
+import org.jsoup.Jsoup;
|
|
|
+import org.jsoup.nodes.Document;
|
|
|
+import org.jsoup.nodes.Element;
|
|
|
+import org.jsoup.select.Elements;
|
|
|
+import org.springframework.beans.factory.annotation.Autowired;
|
|
|
+import org.springframework.scheduling.annotation.Scheduled;
|
|
|
+import org.springframework.stereotype.Service;
|
|
|
+import org.springframework.transaction.annotation.Propagation;
|
|
|
+import org.springframework.transaction.annotation.Transactional;
|
|
|
+import org.springframework.util.CollectionUtils;
|
|
|
+
|
|
|
+import java.io.IOException;
|
|
|
+import java.text.SimpleDateFormat;
|
|
|
+import java.util.*;
|
|
|
+import java.util.concurrent.TimeUnit;
|
|
|
+
|
|
|
+@Slf4j
|
|
|
+@Service
|
|
|
+@RequiredArgsConstructor
|
|
|
+public class GetIprDailyArticleService {
|
|
|
+ private final ArticleInfoService articleInfoService;
|
|
|
+ @Autowired
|
|
|
+ private DifyService difyService;
|
|
|
+ @Autowired
|
|
|
+ private SourceInfoMapper sourceInfoMapper;
|
|
|
+
|
|
|
+ private static final String BaseUrl = "http://www.iprdaily.cn/";
|
|
|
+ private static final Integer BaseId = 40;
|
|
|
+
|
|
|
+ /**
|
|
|
+ * IprDaily中文网
|
|
|
+ * @throws Exception
|
|
|
+ */
|
|
|
+// @Scheduled(cron = "0 15 0 * * ?")
|
|
|
+ @Transactional(propagation = Propagation.REQUIRED,rollbackFor = Throwable.class)
|
|
|
+ public void crawIprDailyArticle() throws Exception {
|
|
|
+ System.out.println(new Date() + "IprDaily-Begin");
|
|
|
+ Map<String, Integer> urlMap = new HashMap<>();
|
|
|
+ //案例
|
|
|
+ urlMap.put("http://www.iprdaily.cn/index.php?s=api&c=api&m=template&name=get_news.html&catid=9", 3);
|
|
|
+ //行业----诉讼
|
|
|
+ urlMap.put("http://www.iprdaily.cn/index.php?s=api&c=api&m=template&name=get_news.html&catid=11", 5);
|
|
|
+ //行业----国际视野
|
|
|
+ urlMap.put("http://www.iprdaily.cn/index.php?s=api&c=api&m=template&name=get_news.html&catid=15", 5);
|
|
|
+ //行业----行业
|
|
|
+ urlMap.put("http://www.iprdaily.cn/index.php?s=api&c=api&m=template&name=get_news.html&catid=8", 5);
|
|
|
+ for (String key : urlMap.keySet()) {
|
|
|
+ Integer type = urlMap.get(key);
|
|
|
+ try {
|
|
|
+ OkHttpClient okHttpClient = new OkHttpClient.Builder()
|
|
|
+ .connectTimeout(60, TimeUnit.SECONDS)
|
|
|
+ .writeTimeout(60, TimeUnit.SECONDS)
|
|
|
+ .readTimeout(60, TimeUnit.SECONDS)
|
|
|
+ .build();
|
|
|
+ Request request = new Request.Builder()
|
|
|
+ .url(key)
|
|
|
+ .get()
|
|
|
+ .build();
|
|
|
+ String res = Objects.requireNonNull(okHttpClient.newCall(request).execute().body()).string();
|
|
|
+ JSONObject parseObject = JSONObject.parseObject(res);
|
|
|
+ String htmlContent = parseObject.getString("msg");
|
|
|
+// final Integer code = parseObject.getInteger("code");
|
|
|
+ this.addIprDailyArticle(htmlContent, BaseUrl,BaseId,type);
|
|
|
+ } catch (IOException e) {
|
|
|
+ System.out.println("Get IPRDaily Web Article Error:" + e);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ System.out.println(new Date() + "IprDaily-End");
|
|
|
+ }
|
|
|
+
|
|
|
+ public void addIprDailyArticle(String htmlContent, String baseUrl,Integer sourceId, Integer type) throws IOException {
|
|
|
+ Document doc = Jsoup.parse(htmlContent);
|
|
|
+ // 解析每个li标签
|
|
|
+ Elements liElements = doc.select("li.box-list");
|
|
|
+ List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
|
|
|
+ for (Element liElement : liElements) {
|
|
|
+ // 提取time中的日期
|
|
|
+ Element timeElement = liElement.selectFirst("dd.time");
|
|
|
+ String dateStr = timeElement.text();
|
|
|
+ String yesterdayDateStr = DateUtil.getYesterdayDateStr();
|
|
|
+ if (!StringUtils.equals(dateStr, yesterdayDateStr)) {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ Date date = new Date();
|
|
|
+ SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
|
|
|
+ try {
|
|
|
+ date = dateFormat.parse(dateStr);
|
|
|
+ } catch (Exception e) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 提取首个title
|
|
|
+ Element titleElement = liElement.selectFirst("dt.title");
|
|
|
+ String title = titleElement.text();
|
|
|
+ // 检查文章是否已存在
|
|
|
+ long count = articleInfoService.count(new LambdaQueryWrapper<ArticleInfo>()
|
|
|
+ .eq(ArticleInfo::getTitle, title));
|
|
|
+ if (count > 0) {
|
|
|
+ continue; // 文章已存在,跳过
|
|
|
+ }
|
|
|
+
|
|
|
+ Element imgBox = liElement.selectFirst("a.img-box");
|
|
|
+ String link = imgBox.attr("href");
|
|
|
+ if (StringUtils.isEmpty(link)) {
|
|
|
+ link = imgBox.attr("href");
|
|
|
+ }
|
|
|
+ if (link.startsWith("/")) {
|
|
|
+ // 提取基础URL的协议和域名
|
|
|
+ int endIndex = baseUrl.indexOf("/", 8); // 跳过http://或https://
|
|
|
+ String baseDomain = endIndex > 0 ? baseUrl.substring(0, endIndex) : baseUrl;
|
|
|
+ link = baseDomain + link;
|
|
|
+ } else if (!link.startsWith("http")) {
|
|
|
+ // 处理相对路径
|
|
|
+ int lastSlash = baseUrl.lastIndexOf("/");
|
|
|
+ String basePath = lastSlash > 0 ? baseUrl.substring(0, lastSlash + 1) : baseUrl + "/";
|
|
|
+ link = basePath + link;
|
|
|
+ }
|
|
|
+ String digest = null;
|
|
|
+ try {
|
|
|
+ digest = this.getDigest(link);
|
|
|
+ } catch (Exception e) {
|
|
|
+ digest = "";
|
|
|
+ }
|
|
|
+ if (StringUtils.isEmpty(digest)) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ String condensedAbstract = null;
|
|
|
+ try {
|
|
|
+ condensedAbstract = difyService.getCondensedAbstract(digest);
|
|
|
+ } catch (Exception ignored) {
|
|
|
+
|
|
|
+ }
|
|
|
+ if (StringUtils.isEmpty(condensedAbstract)) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
|
|
|
+ articleInfoDTO.setCategoryId(type);
|
|
|
+ articleInfoDTO.setSourceId(sourceId);
|
|
|
+ articleInfoDTO.setArticleUrl(link);
|
|
|
+ articleInfoDTO.setTitle(title);
|
|
|
+ articleInfoDTO.setPublicTime(date);
|
|
|
+ articleInfoDTO.setDigest(condensedAbstract);
|
|
|
+ articleInfoDTOS.add(articleInfoDTO);
|
|
|
+ }
|
|
|
+ articleInfoService.batchAddArticleInfo(articleInfoDTOS);
|
|
|
+ }
|
|
|
+
|
|
|
+ public String getDigest(String url) throws IOException {
|
|
|
+ String digest = "";
|
|
|
+ CloseableHttpClient httpClient = HttpClients.createDefault();
|
|
|
+ HttpGet request = new HttpGet(url);
|
|
|
+ HttpResponse response = httpClient.execute(request);
|
|
|
+ String responseBody = EntityUtils.toString(response.getEntity());
|
|
|
+ httpClient.close();
|
|
|
+ if (responseBody != null) {
|
|
|
+ digest = this.readJson(responseBody);
|
|
|
+ }
|
|
|
+ return digest;
|
|
|
+ }
|
|
|
+
|
|
|
+ public String readJson(String responseBody) {
|
|
|
+ String content = "";
|
|
|
+ try {
|
|
|
+ //使用JSoup解析HTML
|
|
|
+ Document doc = Jsoup.parse(responseBody);
|
|
|
+ // 获取description元数据
|
|
|
+ Element metaElement = doc.selectFirst("dl.article-con");
|
|
|
+ Elements elements = metaElement.select("p");
|
|
|
+ List<String> list = new ArrayList<>();
|
|
|
+ for (Element element : elements) {
|
|
|
+ String text = element.text();
|
|
|
+ if (StringUtils.isNotEmpty(text)) {
|
|
|
+ list.add(text);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (CollectionUtils.isEmpty(list)) {
|
|
|
+ String articleContent = elements.text();
|
|
|
+ list.add(articleContent);
|
|
|
+ content = StringUtils.join(list, "\n");
|
|
|
+ } else {
|
|
|
+ content = StringUtils.join(list, "\n");
|
|
|
+ }
|
|
|
+ } catch (Exception e) {
|
|
|
+ content = "";
|
|
|
+ }
|
|
|
+ return content;
|
|
|
+ }
|
|
|
+}
|