123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335 |
- package com.cslg.ppa.service.GetWebArticle;
- import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
- import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
- import com.cslg.ppa.common.utils.DateUtil;
- import com.cslg.ppa.dto.GetArticleInfoDTO;
- import com.cslg.ppa.entity.ArticleInfo;
- import com.cslg.ppa.entity.SourceInfo;
- import com.cslg.ppa.entity.commom.Article;
- import com.cslg.ppa.entity.commom.WxResultBody;
- import com.cslg.ppa.mapper.SourceInfoMapper;
- import com.cslg.ppa.service.ArticleInfoService;
- import com.cslg.ppa.service.commom.DifyService;
- import com.cslg.ppa.service.commom.FileManagerService;
- import com.cslg.ppa.service.commom.WeiXinApi;
- import lombok.RequiredArgsConstructor;
- import lombok.extern.slf4j.Slf4j;
- import org.apache.commons.io.IOUtils;
- import org.apache.commons.lang3.ObjectUtils;
- import org.apache.commons.lang3.StringUtils;
- import org.apache.http.client.methods.CloseableHttpResponse;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.impl.client.CloseableHttpClient;
- import org.apache.http.impl.client.HttpClients;
- import org.apache.http.util.EntityUtils;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- import org.springframework.beans.factory.annotation.Autowired;
- import org.springframework.beans.factory.annotation.Value;
- import org.springframework.scheduling.annotation.Scheduled;
- import org.springframework.stereotype.Service;
- import org.springframework.transaction.annotation.Propagation;
- import org.springframework.transaction.annotation.Transactional;
- import org.springframework.util.CollectionUtils;
- import java.io.File;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.net.HttpURLConnection;
- import java.net.MalformedURLException;
- import java.net.URL;
- import java.nio.file.Files;
- import java.text.SimpleDateFormat;
- import java.util.ArrayList;
- import java.util.Collections;
- import java.util.Date;
- import java.util.List;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import java.util.stream.Collectors;
- @Slf4j
- @Service
- @RequiredArgsConstructor
- public class GetWeChatArticleService {
- @Value("${WeChat.token}")
- private String token;
- @Value("${WeChat.cookie}")
- private String cookie;
- @Autowired
- private SourceInfoMapper sourceInfoMapper;
- @Autowired
- private ArticleInfoService articleInfoService;
- @Autowired
- private DifyService difyService;
- @Autowired
- private FileManagerService fileManagerService;
- @Scheduled(cron = "0 0 4 * * ?")
- @Transactional(propagation = Propagation.REQUIRED,rollbackFor = Throwable.class)
- public void getWeChatArticle() throws Exception {
- System.out.println(new Date() + "Wechat-Begin");
- List<SourceInfo> sourceInfos = sourceInfoMapper.selectList(new LambdaQueryWrapper<SourceInfo>()
- .eq(SourceInfo::getSourceType, 2));
- for (SourceInfo sourceInfo : sourceInfos) {
- final String fakeId = sourceInfo.getFakeId();
- WxResultBody<List<Article>> findExList = WeiXinApi.findExList(fakeId, token,cookie);
- List<Article> exList = findExList.getApp_msg_list();
- List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
- for (Article article : exList) {
- String createTimeSecondStr = article.getCreate_time();
- long secondCreateTime = Long.parseLong(createTimeSecondStr);
- String createTimeStr = DateUtil.convertTimestamp(secondCreateTime);
- String yesterdayDateStr = DateUtil.getYesterdayDateStr();
- if (!StringUtils.equals(createTimeStr, yesterdayDateStr)) {
- continue;
- }
- Date createTime = new Date();
- SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
- try {
- createTime = dateFormat.parse(createTimeStr);
- } catch (Exception e) {
- continue;
- }
- String title = article.getTitle();
- String link = article.getLink();
- String cover = article.getCover();
- // 检查文章是否已存在
- ArticleInfo articleInfo = articleInfoService.getOne(new QueryWrapper<ArticleInfo>().lambda().eq(ArticleInfo::getTitle, title));
- if (ObjectUtils.isNotEmpty(articleInfo)) {
- continue; // 文章已存在,跳过
- }
- //获取公众号内容
- String weChatArticleContent = null;
- try {
- weChatArticleContent = this.getWeChatArticleContent(link);
- } catch (Exception e) {
- weChatArticleContent = "";
- }
- if (StringUtils.isEmpty(weChatArticleContent)) {
- continue;
- }
- String condensedAbstract = null;
- // String pctCondensedAbstract = null;
- try {
- condensedAbstract = difyService.getCondensedAbstract(weChatArticleContent);
- // pctCondensedAbstract = difyService.getPctCondensedAbstract(weChatArticleContent);
- } catch (Exception e) {
- continue;
- }
- if (StringUtils.isEmpty(condensedAbstract)) {
- continue;
- }
- GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
- switch (sourceInfo.getSourceName()) {
- case "国专知识产权":
- articleInfoDTO.setCategoryId(4);
- break;
- case "IPRdaily":
- articleInfoDTO.setCategoryId(3);
- break;
- case "知识产权界":
- articleInfoDTO.setCategoryId(5);
- break;
- case "Bayes美国知识产权":
- articleInfoDTO.setCategoryId(9);
- break;
- default:
- articleInfoDTO.setCategoryId(6);
- break;
- }
- articleInfoDTO.setSourceId(sourceInfo.getId());
- articleInfoDTO.setArticleUrl(link);
- articleInfoDTO.setTitle(title);
- articleInfoDTO.setPublicTime(createTime);
- if (StringUtils.isNotEmpty(cover)) {
- String guid = this.getGuid(cover);
- articleInfoDTO.setWxArticleIcon(guid);
- }
- articleInfoDTO.setDigest(condensedAbstract);
- // articleInfoDTO.setPctDigest(pctCondensedAbstract);
- articleInfoDTOS.add(articleInfoDTO);
- }
- articleInfoService.batchAddArticleInfo(articleInfoDTOS);
- }
- System.out.println(new Date() + "Wechat-End");
- }
- public String getWeChatArticleContent(String articleUrl) {
- String content = "";
- try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
- HttpGet request = new HttpGet(articleUrl);
- // 设置完整的请求头(关键步骤!)
- request.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36");
- request.setHeader("Referer", "https://mp.weixin.qq.com/");
- request.setHeader("Cookie", cookie);
- //执行请求并处理响应
- try (CloseableHttpResponse response = httpClient.execute(request)) {
- final int code = response.getStatusLine().getStatusCode();
- // 检查响应状态码
- if (code != 200) {
- System.err.println("请求失败,状态码: " + code);
- return content;
- }
- // 4. 解析HTML内容
- String htmlContent = EntityUtils.toString(response.getEntity(), "UTF-8");
- Document doc = Jsoup.parse(htmlContent);
- // 5. 提取文章正文(微信使用特定class)
- Element contentElement = doc.selectFirst("#js_content");
- if (contentElement != null) {
- List<String> list = new ArrayList<>();
- Elements elements = contentElement.select("p");
- for (Element element : elements) {
- String text = element.text();
- if (StringUtils.isNotEmpty(text)) {
- list.add(text);
- }
- }
- if (CollectionUtils.isEmpty(list)) {
- String articleContent = contentElement.text();
- list.add(articleContent);
- content = StringUtils.join(list, "\n");
- } else {
- content = StringUtils.join(list, "\n");
- }
- }
- }
- } catch (Exception e) {
- }
- return content;
- }
- public void getWeChatArticleContent1(String articleUrl,Integer categoryId,Integer sourceId) {
- try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
- HttpGet request = new HttpGet(articleUrl);
- // 设置完整的请求头(关键步骤!)
- request.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36");
- request.setHeader("Referer", "https://mp.weixin.qq.com/");
- request.setHeader("Cookie", cookie);
- //执行请求并处理响应
- try (CloseableHttpResponse response = httpClient.execute(request)) {
- final int code = response.getStatusLine().getStatusCode();
- // 检查响应状态码
- if (code != 200) {
- System.err.println("请求失败,状态码: " + code);
- }
- // 4. 解析HTML内容
- String htmlContent = EntityUtils.toString(response.getEntity(), "UTF-8");
- Document doc = Jsoup.parse(htmlContent);
- Elements h1 = doc.select("h1");
- String title = h1.text();
- Elements scripts = doc.select("script");
- String createTimeStr = "";
- for (Element script : scripts) {
- String scriptContent = script.html();
- // 提取 var ct 的值(字符串或数字)
- createTimeStr = extractVariable(scriptContent, "ct");
- if (StringUtils.isNotEmpty(createTimeStr)) {
- break;
- }
- }
- Date createTime = new Date();
- if (StringUtils.isNotEmpty(createTimeStr)) {
- long secondCreateTime = Long.parseLong(createTimeStr);
- String createTimeStr1 = DateUtil.convertTimestamp(secondCreateTime);
- SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
- try {
- createTime = dateFormat.parse(createTimeStr1);
- } catch (Exception e) {
- }
- }
- // 5. 提取文章正文(微信使用特定class)
- String content = "";
- Element contentElement = doc.selectFirst("#js_content");
- if (contentElement != null) {
- List<String> list = new ArrayList<>();
- Elements elements = contentElement.select("p");
- for (Element element : elements) {
- String text = element.text();
- if (StringUtils.isNotEmpty(text)) {
- list.add(text);
- }
- }
- if (CollectionUtils.isEmpty(list)) {
- String articleContent = contentElement.text();
- list.add(articleContent);
- content = StringUtils.join(list, "\n");
- } else {
- content = StringUtils.join(list, "\n");
- }
- }
- String condensedAbstract = difyService.getCondensedAbstract(content);
- // String pctCondensedAbstract = difyService.getPctCondensedAbstract(content);
- ArticleInfo articleInfo = new ArticleInfo();
- articleInfo.setTitle(title);
- articleInfo.setDigest(condensedAbstract);
- // articleInfo.setPctDigest(pctCondensedAbstract);
- articleInfo.setCategoryId(categoryId);
- articleInfo.setSourceId(sourceId);
- articleInfo.setPublicTime(createTime);
- articleInfo.setArticleUrl(articleUrl);
- articleInfo.insert();
- }
- } catch (Exception e) {
- }
- }
- private static String extractVariable(String scriptContent, String varName) {
- // 正则表达式匹配变量赋值(支持字符串或数字)
- Pattern pattern = Pattern.compile(
- "var\\s+" + varName + "\\s*=\\s*(['\"])?(.*?)\\1\\s*;", // 匹配 var varName = 'value' 或 var varName = 123;
- Pattern.DOTALL
- );
- Matcher matcher = pattern.matcher(scriptContent);
- if (matcher.find()) {
- return matcher.group(2).trim(); // 返回捕获的值(第二组)
- }
- return null;
- }
- public String getGuid(String url) throws Exception {
- String guid = "";
- try {
- URL fileUrl = new URL(url);
- HttpURLConnection connection = (HttpURLConnection) fileUrl.openConnection();
- File tempFile = File.createTempFile("tem-", ".jpeg");
- try (InputStream in = connection.getInputStream(); FileOutputStream out = new FileOutputStream(tempFile)) {
- IOUtils.copy(in, out);
- }
- List<String> list = null;
- try {
- list = fileManagerService.uploadFileGetGuid2(Collections.singletonList(tempFile));
- } catch (IOException e) {
- list = new ArrayList<>();
- }
- Files.delete(tempFile.toPath());
- if (!CollectionUtils.isEmpty(list)) {
- guid = list.get(0);
- } else {
- guid = url;
- }
- } catch (Exception e) {
- return url;
- }
- return guid;
- }
- }
|