GetWeChatArticleService.java 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439
  1. package com.cslg.ppa.service.GetWebArticle;
  2. import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
  3. import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
  4. import com.cslg.ppa.common.core.base.RedisConf;
  5. import com.cslg.ppa.common.utils.DateUtil;
  6. import com.cslg.ppa.common.utils.RedisUtil;
  7. import com.cslg.ppa.dto.GetArticleInfoDTO;
  8. import com.cslg.ppa.entity.ArticleInfo;
  9. import com.cslg.ppa.entity.SourceInfo;
  10. import com.cslg.ppa.entity.commom.Article;
  11. import com.cslg.ppa.entity.commom.WxResultBody;
  12. import com.cslg.ppa.mapper.SourceInfoMapper;
  13. import com.cslg.ppa.service.ArticleInfoService;
  14. import com.cslg.ppa.service.commom.DifyService;
  15. import com.cslg.ppa.service.commom.FileManagerService;
  16. import com.cslg.ppa.service.commom.WeiXinApi;
  17. import lombok.RequiredArgsConstructor;
  18. import lombok.extern.slf4j.Slf4j;
  19. import org.apache.commons.io.IOUtils;
  20. import org.apache.commons.lang3.ObjectUtils;
  21. import org.apache.commons.lang3.StringUtils;
  22. import org.apache.http.client.methods.CloseableHttpResponse;
  23. import org.apache.http.client.methods.HttpGet;
  24. import org.apache.http.impl.client.CloseableHttpClient;
  25. import org.apache.http.impl.client.HttpClients;
  26. import org.apache.http.util.EntityUtils;
  27. import org.jsoup.Jsoup;
  28. import org.jsoup.nodes.Document;
  29. import org.jsoup.nodes.Element;
  30. import org.jsoup.select.Elements;
  31. import org.springframework.beans.factory.annotation.Autowired;
  32. import org.springframework.beans.factory.annotation.Value;
  33. import org.springframework.scheduling.annotation.Scheduled;
  34. import org.springframework.stereotype.Service;
  35. import org.springframework.transaction.annotation.Propagation;
  36. import org.springframework.transaction.annotation.Transactional;
  37. import org.springframework.util.CollectionUtils;
  38. import java.io.File;
  39. import java.io.FileOutputStream;
  40. import java.io.IOException;
  41. import java.io.InputStream;
  42. import java.net.HttpURLConnection;
  43. import java.net.MalformedURLException;
  44. import java.net.URL;
  45. import java.nio.file.Files;
  46. import java.text.SimpleDateFormat;
  47. import java.util.ArrayList;
  48. import java.util.Collections;
  49. import java.util.Date;
  50. import java.util.List;
  51. import java.util.regex.Matcher;
  52. import java.util.regex.Pattern;
  53. import java.util.stream.Collectors;
  54. @Slf4j
  55. @Service
  56. @RequiredArgsConstructor
  57. public class GetWeChatArticleService {
  // NOTE(review): the class is annotated with @RequiredArgsConstructor, but these fields are
  // non-final and field-injected via @Autowired, so that annotation presumably contributes
  // nothing here - confirm before switching injection style.

  // Mapper used to load the SourceInfo rows to crawl.
  @Autowired
  private SourceInfoMapper sourceInfoMapper;
  // Used to check whether an article is already stored and to batch-insert new ones.
  @Autowired
  private ArticleInfoService articleInfoService;
  // Used to obtain article digests and classification labels.
  @Autowired
  private DifyService difyService;
  // Used to upload downloaded cover images and obtain their guid.
  @Autowired
  private FileManagerService fileManagerService;
  // Used to read the WeChat token and cookie.
  @Autowired
  private RedisUtil redisUtil;
  68. // @Scheduled(cron = "0 0 6 * * ?")
  69. // @Transactional(propagation = Propagation.REQUIRED, rollbackFor = Throwable.class)
  70. public void getWeChatArticle() throws Exception {
  71. String token = redisUtil.get(RedisConf.WECHAT_TOKEN + RedisConf.SYMBOL_COLON);
  72. String cookie = redisUtil.get(RedisConf.WECHAT_COOKIE + RedisConf.SYMBOL_COLON);
  73. System.out.println(new Date() + "Wechat-Begin");
  74. List<SourceInfo> sourceInfos = sourceInfoMapper.selectList(new LambdaQueryWrapper<SourceInfo>()
  75. .eq(SourceInfo::getSourceType, 2));
  76. for (SourceInfo sourceInfo : sourceInfos) {
  77. final String fakeId = sourceInfo.getFakeId();
  78. WxResultBody<List<Article>> findExList = WeiXinApi.findExList(fakeId, token, cookie);
  79. List<Article> exList = findExList.getApp_msg_list();
  80. List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
  81. for (Article article : exList) {
  82. String createTimeSecondStr = article.getCreate_time();
  83. long secondCreateTime = Long.parseLong(createTimeSecondStr);
  84. String createTimeStr = DateUtil.convertTimestamp(secondCreateTime);
  85. String yesterdayDateStr = DateUtil.getYesterdayDateStr();
  86. if (!StringUtils.equals(createTimeStr, yesterdayDateStr)) {
  87. break;
  88. }
  89. Date createTime = new Date();
  90. SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
  91. try {
  92. createTime = dateFormat.parse(createTimeStr);
  93. } catch (Exception e) {
  94. continue;
  95. }
  96. String title = article.getTitle();
  97. String link = article.getLink();
  98. String cover = article.getCover();
  99. // 检查文章是否已存在
  100. long sum = articleInfoService.count(new LambdaQueryWrapper<ArticleInfo>()
  101. .eq(ArticleInfo::getTitle, title)
  102. .or()
  103. .eq(ArticleInfo::getArticleUrl, link));
  104. if (sum > 0) {
  105. continue; // 文章已存在,跳过
  106. }
  107. //获取公众号内容
  108. String weChatArticleContent = null;
  109. try {
  110. weChatArticleContent = this.getWeChatArticleContent(link, cookie);
  111. } catch (Exception e) {
  112. weChatArticleContent = "";
  113. }
  114. if (StringUtils.isEmpty(weChatArticleContent)) {
  115. continue;
  116. }
  117. String condensedAbstract = null;
  118. try {
  119. condensedAbstract = difyService.getCondensedAbstract(weChatArticleContent);
  120. } catch (Exception e) {
  121. continue;
  122. }
  123. if (StringUtils.isEmpty(condensedAbstract)) {
  124. continue;
  125. }
  126. GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
  127. switch (sourceInfo.getSourceName()) {
  128. case "国专知识产权公众号":
  129. case "知识产权界公众号":
  130. case "IPRdaily公众号":
  131. case "Bayes美国知识产权公众号":
  132. case "知识产权那点事公众号":
  133. articleInfoDTO.setCategoryId(this.wxArticleClassify(title));
  134. break;
  135. case "跨域知见公众号":
  136. articleInfoDTO.setCategoryId(12);
  137. break;
  138. default:
  139. articleInfoDTO.setCategoryId(6);
  140. break;
  141. }
  142. articleInfoDTO.setSourceId(sourceInfo.getId());
  143. articleInfoDTO.setArticleUrl(link);
  144. articleInfoDTO.setTitle(title);
  145. articleInfoDTO.setPublicTime(createTime);
  146. if (StringUtils.isNotEmpty(cover)) {
  147. String guid = this.getGuid(cover);
  148. articleInfoDTO.setWxArticleIcon(guid);
  149. }
  150. articleInfoDTO.setDigest(condensedAbstract);
  151. articleInfoDTOS.add(articleInfoDTO);
  152. }
  153. articleInfoService.batchAddArticleInfo(articleInfoDTOS);
  154. }
  155. System.out.println(new Date() + "Wechat-End");
  156. }
  157. public void saveWeChatArticle(String fakeId, String sourceName, Integer sourceInfoId) {
  158. String token = redisUtil.get(RedisConf.WECHAT_TOKEN + RedisConf.SYMBOL_COLON);
  159. String cookie = redisUtil.get(RedisConf.WECHAT_COOKIE + RedisConf.SYMBOL_COLON);
  160. WxResultBody<List<Article>> findExList = WeiXinApi.findExList(fakeId, token, cookie);
  161. List<Article> exList = findExList.getApp_msg_list();
  162. List<GetArticleInfoDTO> articleInfoDTOS = new ArrayList<>();
  163. for (Article article : exList) {
  164. String createTimeSecondStr = article.getCreate_time();
  165. long secondCreateTime = Long.parseLong(createTimeSecondStr);
  166. String createTimeStr = DateUtil.convertTimestamp(secondCreateTime);
  167. String yesterdayDateStr = DateUtil.getYesterdayDateStr();
  168. if (!StringUtils.equals(createTimeStr, yesterdayDateStr)) {
  169. break;
  170. }
  171. Date createTime = new Date();
  172. SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
  173. try {
  174. createTime = dateFormat.parse(createTimeStr);
  175. } catch (Exception e) {
  176. continue;
  177. }
  178. String title = article.getTitle().trim();
  179. String link = article.getLink();
  180. String cover = article.getCover();
  181. // 检查文章是否已存在
  182. long sum = articleInfoService.count(new LambdaQueryWrapper<ArticleInfo>()
  183. .eq(ArticleInfo::getTitle, title)
  184. .or()
  185. .eq(ArticleInfo::getArticleUrl, link));
  186. if (sum > 0) {
  187. continue; // 文章已存在,跳过
  188. }
  189. //获取公众号内容
  190. String weChatArticleContent = null;
  191. try {
  192. weChatArticleContent = this.getWeChatArticleContent(link,cookie);
  193. } catch (Exception e) {
  194. weChatArticleContent = "";
  195. }
  196. if (StringUtils.isEmpty(weChatArticleContent)) {
  197. continue;
  198. }
  199. String condensedAbstract = null;
  200. try {
  201. condensedAbstract = difyService.getCondensedAbstract(weChatArticleContent);
  202. } catch (Exception e) {
  203. continue;
  204. }
  205. if (StringUtils.isEmpty(condensedAbstract)) {
  206. continue;
  207. }
  208. GetArticleInfoDTO articleInfoDTO = new GetArticleInfoDTO();
  209. switch (sourceName) {
  210. case "国专知识产权公众号":
  211. case "知识产权界公众号":
  212. case "IPRdaily公众号":
  213. case "Bayes美国知识产权公众号":
  214. articleInfoDTO.setCategoryId(this.wxArticleClassify(title));
  215. break;
  216. case "跨域知见公众号":
  217. articleInfoDTO.setCategoryId(12);
  218. break;
  219. default:
  220. articleInfoDTO.setCategoryId(6);
  221. break;
  222. }
  223. articleInfoDTO.setSourceId(sourceInfoId);
  224. articleInfoDTO.setArticleUrl(link);
  225. articleInfoDTO.setTitle(title);
  226. articleInfoDTO.setPublicTime(createTime);
  227. if (StringUtils.isNotEmpty(cover)) {
  228. String guid = this.getGuid(cover);
  229. articleInfoDTO.setWxArticleIcon(guid);
  230. }
  231. articleInfoDTO.setDigest(condensedAbstract);
  232. articleInfoDTOS.add(articleInfoDTO);
  233. }
  234. articleInfoService.batchAddArticleInfo(articleInfoDTOS);
  235. }
  236. public Integer wxArticleClassify(String content) {
  237. int classify = 6;
  238. try {
  239. String classifyStr = difyService.getArticleClassify(content);
  240. if (StringUtils.isNotEmpty(classifyStr)) {
  241. if (classifyStr.contains("判例")) {
  242. classify = 3;
  243. } else if (classifyStr.contains("国外")) {
  244. classify = 4;
  245. } else if (classifyStr.contains("行业")) {
  246. classify = 5;
  247. }
  248. }
  249. } catch (Exception e) {
  250. return classify;
  251. }
  252. return classify;
  253. }
  254. public String getWeChatArticleContent(String articleUrl, String cookie) {
  255. String content = "";
  256. try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
  257. HttpGet request = new HttpGet(articleUrl);
  258. // 设置完整的请求头(关键步骤!)
  259. request.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36");
  260. request.setHeader("Referer", "https://mp.weixin.qq.com/");
  261. request.setHeader("Cookie", cookie);
  262. //执行请求并处理响应
  263. try (CloseableHttpResponse response = httpClient.execute(request)) {
  264. final int code = response.getStatusLine().getStatusCode();
  265. // 检查响应状态码
  266. if (code != 200) {
  267. System.err.println("请求失败,状态码: " + code);
  268. return content;
  269. }
  270. // 4. 解析HTML内容
  271. String htmlContent = EntityUtils.toString(response.getEntity(), "UTF-8");
  272. Document doc = Jsoup.parse(htmlContent);
  273. // 5. 提取文章正文(微信使用特定class)
  274. Element contentElement = doc.selectFirst("#js_content");
  275. if (contentElement != null) {
  276. List<String> list = new ArrayList<>();
  277. Elements elements = contentElement.select("p");
  278. for (Element element : elements) {
  279. String text = element.text();
  280. if (StringUtils.isNotEmpty(text)) {
  281. list.add(text);
  282. }
  283. }
  284. if (CollectionUtils.isEmpty(list) || list.size() < 5) {
  285. String articleContent = contentElement.text();
  286. list.add(articleContent);
  287. content = StringUtils.join(list, "\n");
  288. } else {
  289. content = StringUtils.join(list, "\n");
  290. }
  291. }
  292. }
  293. } catch (Exception e) {
  294. }
  295. return content;
  296. }
  297. public String getGuid(String url) {
  298. String guid = "";
  299. try {
  300. URL fileUrl = new URL(url);
  301. HttpURLConnection connection = (HttpURLConnection) fileUrl.openConnection();
  302. File tempFile = File.createTempFile("tem-", ".jpeg");
  303. try (InputStream in = connection.getInputStream(); FileOutputStream out = new FileOutputStream(tempFile)) {
  304. IOUtils.copy(in, out);
  305. }
  306. List<String> list = null;
  307. try {
  308. list = fileManagerService.uploadFileGetGuid2(Collections.singletonList(tempFile));
  309. } catch (IOException e) {
  310. list = new ArrayList<>();
  311. }
  312. Files.delete(tempFile.toPath());
  313. if (!CollectionUtils.isEmpty(list)) {
  314. guid = list.get(0);
  315. } else {
  316. guid = url;
  317. }
  318. } catch (Exception e) {
  319. return url;
  320. }
  321. return guid;
  322. }
  323. //测试使用方法1
  324. public void getWeChatArticleContent1(String articleUrl, Integer categoryId, Integer sourceId) {
  325. try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
  326. HttpGet request = new HttpGet(articleUrl);
  327. // 设置完整的请求头(关键步骤!)
  328. request.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36");
  329. request.setHeader("Referer", "https://mp.weixin.qq.com/");
  330. // request.setHeader("Cookie", cookie);
  331. //执行请求并处理响应
  332. try (CloseableHttpResponse response = httpClient.execute(request)) {
  333. final int code = response.getStatusLine().getStatusCode();
  334. // 检查响应状态码
  335. if (code != 200) {
  336. System.err.println("请求失败,状态码: " + code);
  337. }
  338. // 4. 解析HTML内容
  339. String htmlContent = EntityUtils.toString(response.getEntity(), "UTF-8");
  340. Document doc = Jsoup.parse(htmlContent);
  341. Element h1 = doc.selectFirst("h1");
  342. String title = h1.text();
  343. Elements scripts = doc.select("script");
  344. String createTimeStr = "";
  345. for (Element script : scripts) {
  346. String scriptContent = script.html();
  347. // 提取 var ct 的值(字符串或数字)
  348. createTimeStr = extractVariable(scriptContent, "ct");
  349. if (StringUtils.isNotEmpty(createTimeStr)) {
  350. break;
  351. }
  352. }
  353. Date createTime = new Date();
  354. if (StringUtils.isNotEmpty(createTimeStr)) {
  355. long secondCreateTime = Long.parseLong(createTimeStr);
  356. String createTimeStr1 = DateUtil.convertTimestamp(secondCreateTime);
  357. SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
  358. try {
  359. createTime = dateFormat.parse(createTimeStr1);
  360. } catch (Exception e) {
  361. }
  362. }
  363. // 5. 提取文章正文(微信使用特定class)
  364. String content = "";
  365. Element contentElement = doc.selectFirst("#js_content");
  366. if (contentElement != null) {
  367. List<String> list = new ArrayList<>();
  368. Elements elements = contentElement.select("p");
  369. for (Element element : elements) {
  370. String text = element.text();
  371. if (StringUtils.isNotEmpty(text)) {
  372. list.add(text);
  373. }
  374. }
  375. if (CollectionUtils.isEmpty(list) || list.size() < 5) {
  376. String articleContent = contentElement.text();
  377. list.add(articleContent);
  378. content = StringUtils.join(list, "\n");
  379. } else {
  380. content = StringUtils.join(list, "\n");
  381. }
  382. }
  383. // String condensedAbstract = difyService.getCondensedAbstract(content);
  384. String condensedAbstract = difyService.getZGYDSummary(content);
  385. ArticleInfo articleInfo = new ArticleInfo();
  386. articleInfo.setTitle(title);
  387. articleInfo.setDigest(condensedAbstract);
  388. articleInfo.setCategoryId(categoryId);
  389. articleInfo.setSourceId(sourceId);
  390. articleInfo.setPublicTime(createTime);
  391. articleInfo.setArticleUrl(articleUrl);
  392. articleInfo.insert();
  393. }
  394. } catch (Exception e) {
  395. }
  396. }
  397. ////测试使用方法2
  398. private static String extractVariable(String scriptContent, String varName) {
  399. // 正则表达式匹配变量赋值(支持字符串或数字)
  400. Pattern pattern = Pattern.compile(
  401. "var\\s+" + varName + "\\s*=\\s*(['\"])?(.*?)\\1\\s*;", // 匹配 var varName = 'value' 或 var varName = 123;
  402. Pattern.DOTALL
  403. );
  404. Matcher matcher = pattern.matcher(scriptContent);
  405. if (matcher.find()) {
  406. return matcher.group(2).trim(); // 返回捕获的值(第二组)
  407. }
  408. return null;
  409. }
  410. }