package com.cslg.ppa.service.GetWebArticle; import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper; import com.cslg.ppa.common.utils.DateUtil; import com.cslg.ppa.dto.GetArticleInfoDTO; import com.cslg.ppa.entity.ArticleInfo; import com.cslg.ppa.entity.SourceInfo; import com.cslg.ppa.entity.commom.Article; import com.cslg.ppa.entity.commom.WxResultBody; import com.cslg.ppa.mapper.SourceInfoMapper; import com.cslg.ppa.service.ArticleInfoService; import com.cslg.ppa.service.commom.DifyService; import com.cslg.ppa.service.commom.FileManagerService; import com.cslg.ppa.service.commom.WeiXinApi; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.ObjectUtils; import org.apache.commons.lang3.StringUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Value; import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Propagation; import org.springframework.transaction.annotation.Transactional; import org.springframework.util.CollectionUtils; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.nio.file.Files; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Collections; import java.util.Date; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; @Slf4j @Service @RequiredArgsConstructor public class GetWeChatArticleService { @Value("${WeChat.token}") private String token; @Value("${WeChat.cookie}") private String cookie; @Autowired private SourceInfoMapper sourceInfoMapper; @Autowired private ArticleInfoService articleInfoService; @Autowired private DifyService difyService; @Autowired private FileManagerService fileManagerService; @Scheduled(cron = "0 0 4 * * ?") @Transactional(propagation = Propagation.REQUIRED,rollbackFor = Throwable.class) public void getWeChatArticle() throws Exception { System.out.println(new Date() + "Wechat-Begin"); List sourceInfos = sourceInfoMapper.selectList(new LambdaQueryWrapper() .eq(SourceInfo::getSourceType, 2)); for (SourceInfo sourceInfo : sourceInfos) { final String fakeId = sourceInfo.getFakeId(); WxResultBody> findExList = WeiXinApi.findExList(fakeId, token,cookie); List
exList = findExList.getApp_msg_list(); List articleInfoDTOS = new ArrayList<>(); for (Article article : exList) { String createTimeSecondStr = article.getCreate_time(); long secondCreateTime = Long.parseLong(createTimeSecondStr); String createTimeStr = DateUtil.convertTimestamp(secondCreateTime); String yesterdayDateStr = DateUtil.getYesterdayDateStr(); if (!StringUtils.equals(createTimeStr, yesterdayDateStr)) { continue; } Date createTime = new Date(); SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd"); try { createTime = dateFormat.parse(createTimeStr); } catch (Exception e) { continue; } String title = article.getTitle(); String link = article.getLink(); String cover = article.getCover(); // 检查文章是否已存在 ArticleInfo articleInfo = articleInfoService.getOne(new QueryWrapper().lambda().eq(ArticleInfo::getTitle, title)); if (ObjectUtils.isNotEmpty(articleInfo)) { continue; // 文章已存在,跳过 } //获取公众号内容 String weChatArticleContent = null; try { weChatArticleContent = this.getWeChatArticleContent(link); } catch (Exception e) { weChatArticleContent = ""; } if (StringUtils.isEmpty(weChatArticleContent)) { continue; } String condensedAbstract = null; // String pctCondensedAbstract = null; try { condensedAbstract = difyService.getCondensedAbstract(weChatArticleContent); // pctCondensedAbstract = difyService.getPctCondensedAbstract(weChatArticleContent); } catch (Exception e) { continue; } if (StringUtils.isEmpty(condensedAbstract)) { continue; } GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO(); switch (sourceInfo.getSourceName()) { case "国专知识产权": articleInfoDTO.setCategoryId(4); break; case "IPRdaily": articleInfoDTO.setCategoryId(3); break; case "知识产权界": articleInfoDTO.setCategoryId(5); break; case "Bayes美国知识产权": articleInfoDTO.setCategoryId(9); break; default: articleInfoDTO.setCategoryId(6); break; } articleInfoDTO.setSourceId(sourceInfo.getId()); articleInfoDTO.setArticleUrl(link); articleInfoDTO.setTitle(title); articleInfoDTO.setPublicTime(createTime); if (StringUtils.isNotEmpty(cover)) { String guid = this.getGuid(cover); articleInfoDTO.setWxArticleIcon(guid); } articleInfoDTO.setDigest(condensedAbstract); // articleInfoDTO.setPctDigest(pctCondensedAbstract); articleInfoDTOS.add(articleInfoDTO); } articleInfoService.batchAddArticleInfo(articleInfoDTOS); } System.out.println(new Date() + "Wechat-End"); } public String getWeChatArticleContent(String articleUrl) { String content = ""; try (CloseableHttpClient httpClient = HttpClients.createDefault()) { HttpGet request = new HttpGet(articleUrl); // 设置完整的请求头(关键步骤!) request.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"); request.setHeader("Referer", "https://mp.weixin.qq.com/"); request.setHeader("Cookie", cookie); //执行请求并处理响应 try (CloseableHttpResponse response = httpClient.execute(request)) { final int code = response.getStatusLine().getStatusCode(); // 检查响应状态码 if (code != 200) { System.err.println("请求失败,状态码: " + code); return content; } // 4. 解析HTML内容 String htmlContent = EntityUtils.toString(response.getEntity(), "UTF-8"); Document doc = Jsoup.parse(htmlContent); // 5. 提取文章正文(微信使用特定class) Element contentElement = doc.selectFirst("#js_content"); if (contentElement != null) { List list = new ArrayList<>(); Elements elements = contentElement.select("p"); for (Element element : elements) { String text = element.text(); if (StringUtils.isNotEmpty(text)) { list.add(text); } } if (CollectionUtils.isEmpty(list)) { String articleContent = contentElement.text(); list.add(articleContent); content = StringUtils.join(list, "\n"); } else { content = StringUtils.join(list, "\n"); } } } } catch (Exception e) { } return content; } public void getWeChatArticleContent1(String articleUrl,Integer categoryId,Integer sourceId) { try (CloseableHttpClient httpClient = HttpClients.createDefault()) { HttpGet request = new HttpGet(articleUrl); // 设置完整的请求头(关键步骤!) request.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"); request.setHeader("Referer", "https://mp.weixin.qq.com/"); request.setHeader("Cookie", cookie); //执行请求并处理响应 try (CloseableHttpResponse response = httpClient.execute(request)) { final int code = response.getStatusLine().getStatusCode(); // 检查响应状态码 if (code != 200) { System.err.println("请求失败,状态码: " + code); } // 4. 解析HTML内容 String htmlContent = EntityUtils.toString(response.getEntity(), "UTF-8"); Document doc = Jsoup.parse(htmlContent); Elements h1 = doc.select("h1"); String title = h1.text(); Elements scripts = doc.select("script"); String createTimeStr = ""; for (Element script : scripts) { String scriptContent = script.html(); // 提取 var ct 的值(字符串或数字) createTimeStr = extractVariable(scriptContent, "ct"); if (StringUtils.isNotEmpty(createTimeStr)) { break; } } Date createTime = new Date(); if (StringUtils.isNotEmpty(createTimeStr)) { long secondCreateTime = Long.parseLong(createTimeStr); String createTimeStr1 = DateUtil.convertTimestamp(secondCreateTime); SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd"); try { createTime = dateFormat.parse(createTimeStr1); } catch (Exception e) { } } // 5. 提取文章正文(微信使用特定class) String content = ""; Element contentElement = doc.selectFirst("#js_content"); if (contentElement != null) { List list = new ArrayList<>(); Elements elements = contentElement.select("p"); for (Element element : elements) { String text = element.text(); if (StringUtils.isNotEmpty(text)) { list.add(text); } } if (CollectionUtils.isEmpty(list)) { String articleContent = contentElement.text(); list.add(articleContent); content = StringUtils.join(list, "\n"); } else { content = StringUtils.join(list, "\n"); } } String condensedAbstract = difyService.getCondensedAbstract(content); // String pctCondensedAbstract = difyService.getPctCondensedAbstract(content); ArticleInfo articleInfo = new ArticleInfo(); articleInfo.setTitle(title); articleInfo.setDigest(condensedAbstract); // articleInfo.setPctDigest(pctCondensedAbstract); articleInfo.setCategoryId(categoryId); articleInfo.setSourceId(sourceId); articleInfo.setPublicTime(createTime); articleInfo.setArticleUrl(articleUrl); articleInfo.insert(); } } catch (Exception e) { } } private static String extractVariable(String scriptContent, String varName) { // 正则表达式匹配变量赋值(支持字符串或数字) Pattern pattern = Pattern.compile( "var\\s+" + varName + "\\s*=\\s*(['\"])?(.*?)\\1\\s*;", // 匹配 var varName = 'value' 或 var varName = 123; Pattern.DOTALL ); Matcher matcher = pattern.matcher(scriptContent); if (matcher.find()) { return matcher.group(2).trim(); // 返回捕获的值(第二组) } return null; } public String getGuid(String url) throws Exception { String guid = ""; try { URL fileUrl = new URL(url); HttpURLConnection connection = (HttpURLConnection) fileUrl.openConnection(); File tempFile = File.createTempFile("tem-", ".jpeg"); try (InputStream in = connection.getInputStream(); FileOutputStream out = new FileOutputStream(tempFile)) { IOUtils.copy(in, out); } List list = null; try { list = fileManagerService.uploadFileGetGuid2(Collections.singletonList(tempFile)); } catch (IOException e) { list = new ArrayList<>(); } Files.delete(tempFile.toPath()); if (!CollectionUtils.isEmpty(list)) { guid = list.get(0); } else { guid = url; } } catch (Exception e) { return url; } return guid; } }