GetWeChatArticleService.java 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
  1. package com.cslg.ppa.service.GetWebArticle;
  2. import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
  3. import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
  4. import com.cslg.ppa.common.utils.DateUtil;
  5. import com.cslg.ppa.dto.GetArticleInfoDTO;
  6. import com.cslg.ppa.entity.ArticleInfo;
  7. import com.cslg.ppa.entity.SourceInfo;
  8. import com.cslg.ppa.entity.commom.Article;
  9. import com.cslg.ppa.entity.commom.WxResultBody;
  10. import com.cslg.ppa.mapper.SourceInfoMapper;
  11. import com.cslg.ppa.service.ArticleInfoService;
  12. import com.cslg.ppa.service.commom.DifyService;
  13. import com.cslg.ppa.service.commom.FileManagerService;
  14. import com.cslg.ppa.service.commom.WeiXinApi;
  15. import lombok.RequiredArgsConstructor;
  16. import lombok.extern.slf4j.Slf4j;
  17. import org.apache.commons.io.IOUtils;
  18. import org.apache.commons.lang3.ObjectUtils;
  19. import org.apache.commons.lang3.StringUtils;
  20. import org.apache.http.client.methods.CloseableHttpResponse;
  21. import org.apache.http.client.methods.HttpGet;
  22. import org.apache.http.impl.client.CloseableHttpClient;
  23. import org.apache.http.impl.client.HttpClients;
  24. import org.apache.http.util.EntityUtils;
  25. import org.jsoup.Jsoup;
  26. import org.jsoup.nodes.Document;
  27. import org.jsoup.nodes.Element;
  28. import org.jsoup.select.Elements;
  29. import org.springframework.beans.factory.annotation.Autowired;
  30. import org.springframework.beans.factory.annotation.Value;
  31. import org.springframework.scheduling.annotation.Scheduled;
  32. import org.springframework.stereotype.Service;
  33. import org.springframework.transaction.annotation.Propagation;
  34. import org.springframework.transaction.annotation.Transactional;
  35. import org.springframework.util.CollectionUtils;
  36. import java.io.File;
  37. import java.io.FileOutputStream;
  38. import java.io.IOException;
  39. import java.io.InputStream;
  40. import java.net.HttpURLConnection;
  41. import java.net.MalformedURLException;
  42. import java.net.URL;
  43. import java.nio.file.Files;
  44. import java.text.SimpleDateFormat;
  45. import java.util.ArrayList;
  46. import java.util.Collections;
  47. import java.util.Date;
  48. import java.util.List;
  49. import java.util.regex.Matcher;
  50. import java.util.regex.Pattern;
  51. import java.util.stream.Collectors;
  52. @Slf4j
  53. @Service
  54. @RequiredArgsConstructor
  55. public class GetWeChatArticleService {
  // Value of the "WeChat.token" configuration property; passed to WeiXinApi.findExList
  // when listing a source's articles.
  56. @Value("${WeChat.token}")
  57. private String token;
  // Value of the "WeChat.cookie" configuration property; passed to WeiXinApi.findExList and
  // sent as the Cookie header when article pages are downloaded.
  58. @Value("${WeChat.cookie}")
  59. private String cookie;
  // Mapper used to load the SourceInfo rows (sourceType == 2) that the scheduled job iterates.
  60. @Autowired
  61. private SourceInfoMapper sourceInfoMapper;
  // Used to look up already stored articles by title and to batch-save newly collected ones.
  62. @Autowired
  63. private ArticleInfoService articleInfoService;
  // Produces the condensed abstract (digest) from an article's body text.
  64. @Autowired
  65. private DifyService difyService;
  // Uploads downloaded cover images; the first returned value is stored as the article icon.
  66. @Autowired
  67. private FileManagerService fileManagerService;
  68. @Scheduled(cron = "0 0 4 * * ?")
  69. @Transactional(propagation = Propagation.REQUIRED,rollbackFor = Throwable.class)
  70. public void getWeChatArticle() throws Exception {
  71. System.out.println(new Date() + "Wechat-Begin");
  72. List<SourceInfo> sourceInfos = sourceInfoMapper.selectList(new LambdaQueryWrapper<SourceInfo>()
  73. .eq(SourceInfo::getSourceType, 2));
  74. for (SourceInfo sourceInfo : sourceInfos) {
  75. final String fakeId = sourceInfo.getFakeId();
  76. WxResultBody<List<Article>> findExList = WeiXinApi.findExList(fakeId, token,cookie);
  77. List<Article> exList = findExList.getApp_msg_list();
  78. List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
  79. for (Article article : exList) {
  80. String createTimeSecondStr = article.getCreate_time();
  81. long secondCreateTime = Long.parseLong(createTimeSecondStr);
  82. String createTimeStr = DateUtil.convertTimestamp(secondCreateTime);
  83. String yesterdayDateStr = DateUtil.getYesterdayDateStr();
  84. if (!StringUtils.equals(createTimeStr, yesterdayDateStr)) {
  85. continue;
  86. }
  87. Date createTime = new Date();
  88. SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
  89. try {
  90. createTime = dateFormat.parse(createTimeStr);
  91. } catch (Exception e) {
  92. continue;
  93. }
  94. String title = article.getTitle();
  95. String link = article.getLink();
  96. String cover = article.getCover();
  97. // 检查文章是否已存在
  98. ArticleInfo articleInfo = articleInfoService.getOne(new QueryWrapper<ArticleInfo>().lambda().eq(ArticleInfo::getTitle, title));
  99. if (ObjectUtils.isNotEmpty(articleInfo)) {
  100. continue; // 文章已存在,跳过
  101. }
  102. //获取公众号内容
  103. String weChatArticleContent = null;
  104. try {
  105. weChatArticleContent = this.getWeChatArticleContent(link);
  106. } catch (Exception e) {
  107. weChatArticleContent = "";
  108. }
  109. if (StringUtils.isEmpty(weChatArticleContent)) {
  110. continue;
  111. }
  112. String condensedAbstract = null;
  113. // String pctCondensedAbstract = null;
  114. try {
  115. condensedAbstract = difyService.getCondensedAbstract(weChatArticleContent);
  116. // pctCondensedAbstract = difyService.getPctCondensedAbstract(weChatArticleContent);
  117. } catch (Exception e) {
  118. continue;
  119. }
  120. if (StringUtils.isEmpty(condensedAbstract)) {
  121. continue;
  122. }
  123. GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
  124. switch (sourceInfo.getSourceName()) {
  125. case "国专知识产权":
  126. articleInfoDTO.setCategoryId(4);
  127. break;
  128. case "IPRdaily":
  129. articleInfoDTO.setCategoryId(3);
  130. break;
  131. case "知识产权界":
  132. articleInfoDTO.setCategoryId(5);
  133. break;
  134. case "Bayes美国知识产权":
  135. articleInfoDTO.setCategoryId(9);
  136. break;
  137. default:
  138. articleInfoDTO.setCategoryId(6);
  139. break;
  140. }
  141. articleInfoDTO.setSourceId(sourceInfo.getId());
  142. articleInfoDTO.setArticleUrl(link);
  143. articleInfoDTO.setTitle(title);
  144. articleInfoDTO.setPublicTime(createTime);
  145. if (StringUtils.isNotEmpty(cover)) {
  146. String guid = this.getGuid(cover);
  147. articleInfoDTO.setWxArticleIcon(guid);
  148. }
  149. articleInfoDTO.setDigest(condensedAbstract);
  150. // articleInfoDTO.setPctDigest(pctCondensedAbstract);
  151. articleInfoDTOS.add(articleInfoDTO);
  152. }
  153. articleInfoService.batchAddArticleInfo(articleInfoDTOS);
  154. }
  155. System.out.println(new Date() + "Wechat-End");
  156. }
  157. public String getWeChatArticleContent(String articleUrl) {
  158. String content = "";
  159. try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
  160. HttpGet request = new HttpGet(articleUrl);
  161. // 设置完整的请求头(关键步骤!)
  162. request.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36");
  163. request.setHeader("Referer", "https://mp.weixin.qq.com/");
  164. request.setHeader("Cookie", cookie);
  165. //执行请求并处理响应
  166. try (CloseableHttpResponse response = httpClient.execute(request)) {
  167. final int code = response.getStatusLine().getStatusCode();
  168. // 检查响应状态码
  169. if (code != 200) {
  170. System.err.println("请求失败,状态码: " + code);
  171. return content;
  172. }
  173. // 4. 解析HTML内容
  174. String htmlContent = EntityUtils.toString(response.getEntity(), "UTF-8");
  175. Document doc = Jsoup.parse(htmlContent);
  176. // 5. 提取文章正文(微信使用特定class)
  177. Element contentElement = doc.selectFirst("#js_content");
  178. if (contentElement != null) {
  179. List<String> list = new ArrayList<>();
  180. Elements elements = contentElement.select("p");
  181. for (Element element : elements) {
  182. String text = element.text();
  183. if (StringUtils.isNotEmpty(text)) {
  184. list.add(text);
  185. }
  186. }
  187. if (CollectionUtils.isEmpty(list)) {
  188. String articleContent = contentElement.text();
  189. list.add(articleContent);
  190. content = StringUtils.join(list, "\n");
  191. } else {
  192. content = StringUtils.join(list, "\n");
  193. }
  194. }
  195. }
  196. } catch (Exception e) {
  197. }
  198. return content;
  199. }
  200. public void getWeChatArticleContent1(String articleUrl,Integer categoryId,Integer sourceId) {
  201. try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
  202. HttpGet request = new HttpGet(articleUrl);
  203. // 设置完整的请求头(关键步骤!)
  204. request.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36");
  205. request.setHeader("Referer", "https://mp.weixin.qq.com/");
  206. request.setHeader("Cookie", cookie);
  207. //执行请求并处理响应
  208. try (CloseableHttpResponse response = httpClient.execute(request)) {
  209. final int code = response.getStatusLine().getStatusCode();
  210. // 检查响应状态码
  211. if (code != 200) {
  212. System.err.println("请求失败,状态码: " + code);
  213. }
  214. // 4. 解析HTML内容
  215. String htmlContent = EntityUtils.toString(response.getEntity(), "UTF-8");
  216. Document doc = Jsoup.parse(htmlContent);
  217. Elements h1 = doc.select("h1");
  218. String title = h1.text();
  219. Elements scripts = doc.select("script");
  220. String createTimeStr = "";
  221. for (Element script : scripts) {
  222. String scriptContent = script.html();
  223. // 提取 var ct 的值(字符串或数字)
  224. createTimeStr = extractVariable(scriptContent, "ct");
  225. if (StringUtils.isNotEmpty(createTimeStr)) {
  226. break;
  227. }
  228. }
  229. Date createTime = new Date();
  230. if (StringUtils.isNotEmpty(createTimeStr)) {
  231. long secondCreateTime = Long.parseLong(createTimeStr);
  232. String createTimeStr1 = DateUtil.convertTimestamp(secondCreateTime);
  233. SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
  234. try {
  235. createTime = dateFormat.parse(createTimeStr1);
  236. } catch (Exception e) {
  237. }
  238. }
  239. // 5. 提取文章正文(微信使用特定class)
  240. String content = "";
  241. Element contentElement = doc.selectFirst("#js_content");
  242. if (contentElement != null) {
  243. List<String> list = new ArrayList<>();
  244. Elements elements = contentElement.select("p");
  245. for (Element element : elements) {
  246. String text = element.text();
  247. if (StringUtils.isNotEmpty(text)) {
  248. list.add(text);
  249. }
  250. }
  251. if (CollectionUtils.isEmpty(list)) {
  252. String articleContent = contentElement.text();
  253. list.add(articleContent);
  254. content = StringUtils.join(list, "\n");
  255. } else {
  256. content = StringUtils.join(list, "\n");
  257. }
  258. }
  259. String condensedAbstract = difyService.getCondensedAbstract(content);
  260. // String pctCondensedAbstract = difyService.getPctCondensedAbstract(content);
  261. ArticleInfo articleInfo = new ArticleInfo();
  262. articleInfo.setTitle(title);
  263. articleInfo.setDigest(condensedAbstract);
  264. // articleInfo.setPctDigest(pctCondensedAbstract);
  265. articleInfo.setCategoryId(categoryId);
  266. articleInfo.setSourceId(sourceId);
  267. articleInfo.setPublicTime(createTime);
  268. articleInfo.setArticleUrl(articleUrl);
  269. articleInfo.insert();
  270. }
  271. } catch (Exception e) {
  272. }
  273. }
  274. private static String extractVariable(String scriptContent, String varName) {
  275. // 正则表达式匹配变量赋值(支持字符串或数字)
  276. Pattern pattern = Pattern.compile(
  277. "var\\s+" + varName + "\\s*=\\s*(['\"])?(.*?)\\1\\s*;", // 匹配 var varName = 'value' 或 var varName = 123;
  278. Pattern.DOTALL
  279. );
  280. Matcher matcher = pattern.matcher(scriptContent);
  281. if (matcher.find()) {
  282. return matcher.group(2).trim(); // 返回捕获的值(第二组)
  283. }
  284. return null;
  285. }
  286. public String getGuid(String url) throws Exception {
  287. String guid = "";
  288. try {
  289. URL fileUrl = new URL(url);
  290. HttpURLConnection connection = (HttpURLConnection) fileUrl.openConnection();
  291. File tempFile = File.createTempFile("tem-", ".jpeg");
  292. try (InputStream in = connection.getInputStream(); FileOutputStream out = new FileOutputStream(tempFile)) {
  293. IOUtils.copy(in, out);
  294. }
  295. List<String> list = null;
  296. try {
  297. list = fileManagerService.uploadFileGetGuid2(Collections.singletonList(tempFile));
  298. } catch (IOException e) {
  299. list = new ArrayList<>();
  300. }
  301. Files.delete(tempFile.toPath());
  302. if (!CollectionUtils.isEmpty(list)) {
  303. guid = list.get(0);
  304. } else {
  305. guid = url;
  306. }
  307. } catch (Exception e) {
  308. return url;
  309. }
  310. return guid;
  311. }
  312. }