stjdydayou há 4 anos atrás
pai
commit
79d169e18b

+ 3 - 7
pom.xml

@@ -30,6 +30,7 @@
         <lombok.version>1.18.22</lombok.version>
         <javax.servlet.version>4.0.1</javax.servlet.version>
         <okhttp.version>4.9.3</okhttp.version>
+        <webmagic.version>0.7.5</webmagic.version>
 
 
         <afirma-lib-jmimemagic.version>0.0.6</afirma-lib-jmimemagic.version>
@@ -138,17 +139,12 @@
         <dependency>
             <groupId>us.codecraft</groupId>
             <artifactId>webmagic-core</artifactId>
-            <version>0.7.5</version>
+            <version>${webmagic.version}</version>
         </dependency>
         <dependency>
             <groupId>us.codecraft</groupId>
             <artifactId>webmagic-extension</artifactId>
-            <version>0.7.5</version>
-        </dependency>
-        <dependency>
-            <groupId>com.hankcs.hanlp.restful</groupId>
-            <artifactId>hanlp-restful</artifactId>
-            <version>0.0.7</version>
+            <version>${webmagic.version}</version>
         </dependency>
     </dependencies>
     <build>

+ 43 - 44
src/main/java/com/zhiqiyun/open/core/schedule/AutoEquipmentPassengerPeople.java

@@ -1,6 +1,5 @@
 package com.zhiqiyun.open.core.schedule;
 
-import com.alibaba.fastjson.JSON;
 import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
 import com.zhiqiyun.open.core.enmus.Gender;
 import com.zhiqiyun.open.core.enmus.PassengerType;
@@ -26,58 +25,58 @@ import java.util.List;
 @Slf4j
 @Component
 public class AutoEquipmentPassengerPeople {
-    @Autowired
-    private EquipmentPassengerService equipmentPassengerService;
+	@Autowired
+	private EquipmentPassengerService equipmentPassengerService;
 
-    @Autowired
-    private EquipmentPassengerPeopleService equipmentPassengerPeopleService;
+	@Autowired
+	private EquipmentPassengerPeopleService equipmentPassengerPeopleService;
 
-    @Autowired
-    private DictCityService dictCityService;
+	@Autowired
+	private DictCityService dictCityService;
 
-    @Autowired
-    private SequenceService sequenceService;
+	@Autowired
+	private SequenceService sequenceService;
 
-    @Autowired
-    private StringRedisTemplate stringRedisTemplate;
+	@Autowired
+	private StringRedisTemplate stringRedisTemplate;
 
-    @Scheduled(cron = "0/10 * * * * ?")
-    public void createPeople() {
-        QueryWrapper<DictCity> wrapper = new QueryWrapper<>();
-        wrapper.eq("parent_id", "0");
-        List<DictCity> listDictCity = this.dictCityService.list(wrapper);
-        List<EquipmentPassenger> listData = this.equipmentPassengerService.list();
-        for (EquipmentPassenger equipment : listData) {
+	/**
+	 * Scheduled job (cron {@code 0/10 * * * * ?}) that saves at most one generated
+	 * passenger record per equipment returned by {@code equipmentPassengerService.list()}.
+	 * <p>
+	 * For every equipment a random city with {@code parent_id = "0"} is used as the
+	 * passenger's source. When the generated type is {@link PassengerType#IN} a fresh face id
+	 * is created and left-pushed onto the Redis list {@code equipment<id>}; for any other type
+	 * a face id is right-popped from that same list. The record is persisted only when a
+	 * non-blank face id is available, so nothing is saved while the list is empty.
+	 */
+	@Scheduled(cron = "0/10 * * * * ?")
+	public void createPeople() {
+		QueryWrapper<DictCity> wrapper = new QueryWrapper<>();
+		wrapper.eq("parent_id", "0");
+		List<DictCity> listDictCity = this.dictCityService.list(wrapper);
+		if (listDictCity == null || listDictCity.isEmpty()) {
+			// Without at least one city the random index below has nothing valid to point at.
+			log.warn("No DictCity rows with parent_id=0 found, skipping passenger generation");
+			return;
+		}
+		List<EquipmentPassenger> listData = this.equipmentPassengerService.list();
+		for (EquipmentPassenger equipment : listData) {
 
-            int r1 = RandomUtil.getInt(0, listDictCity.size() - 1);
-            int r2 = r1 % 2 + 1;
-            PassengerType type = PassengerType.valueOf(r2);
+			// NOTE(review): assumes RandomUtil.getInt treats the upper bound as inclusive — confirm.
+			int r1 = RandomUtil.getInt(0, listDictCity.size() - 1);
+			// 1 or 2 for a non-negative r1; the same code selects both passenger type and gender.
+			int r2 = r1 % 2 + 1;
+			PassengerType type = PassengerType.valueOf(r2);
 
-            EquipmentPassengerPeople people = new EquipmentPassengerPeople();
-            people.setId(this.sequenceService.nextId());
-            people.setPlaceBaseInfoId(equipment.getPlaceBaseInfoId());
-            people.setEquipmentId(equipment.getId());
-            people.setPassengerType(type);
-            people.setCreatedTime(DateUtil.current());
-            people.setGender(Gender.valueOf(r2));
+			EquipmentPassengerPeople people = new EquipmentPassengerPeople();
+			people.setId(this.sequenceService.nextId());
+			people.setPlaceBaseInfoId(equipment.getPlaceBaseInfoId());
+			people.setEquipmentId(equipment.getId());
+			people.setPassengerType(type);
+			people.setCreatedTime(DateUtil.current());
+			people.setGender(Gender.valueOf(r2));
 
 
-            people.setFromSource(listDictCity.get(r1).getFullName());
+			people.setFromSource(listDictCity.get(r1).getFullName());
 
 
-            String redisKey = String.format("equipment%s", equipment.getId());
-            BoundListOperations<String, String> boundListOps = this.stringRedisTemplate.boundListOps(redisKey);
-            String faceId;
-            if (PassengerType.IN.equals(type)) {
-                faceId = RandomUtil.getuuid();
-                boundListOps.leftPush(faceId);
-            } else {
-                faceId = boundListOps.rightPop();
-            }
-            if (StringUtils.isNotBlank(faceId)) {
-                people.setFaceId(faceId);
-                this.equipmentPassengerPeopleService.save(people);
-            }
-        }
-    }
+			// One Redis list per equipment holds the face ids pushed for IN passengers.
+			String redisKey = String.format("equipment%s", equipment.getId());
+			BoundListOperations<String, String> boundListOps = this.stringRedisTemplate.boundListOps(redisKey);
+			String faceId;
+			if (PassengerType.IN.equals(type)) {
+				faceId = RandomUtil.getuuid();
+				boundListOps.leftPush(faceId);
+			} else {
+				// Push on the left, pop on the right: the oldest pushed face id is taken first.
+				faceId = boundListOps.rightPop();
+			}
+			// A blank face id means the list was empty, so there is nothing to record.
+			if (StringUtils.isNotBlank(faceId)) {
+				people.setFaceId(faceId);
+				this.equipmentPassengerPeopleService.save(people);
+			}
+		}
+	}
 }

+ 1 - 0
src/main/resources/logback-spring.xml

@@ -18,6 +18,7 @@
     <logger name="com.zaxxer" level="INFO" />
     <logger name="org.ehcache" level="INFO" />
     <logger name="org.apache.http" level="INFO" />
+    <logger name="us.codecraft.webmagic" level="INFO" />
 
     <property name="LOG_PATH" value="${LOGGER_ROOT_PATH}/${SPRING_APP_NAME}" />
     <property name="MAX_HISTORY" value="10" />

+ 0 - 24
src/test/java/com/zhiqiyun/HanLPTest.java

@@ -1,24 +0,0 @@
-package com.zhiqiyun;
-
-import com.alibaba.fastjson.JSON;
-import com.hankcs.hanlp.restful.HanLPClient;
-import org.junit.Test;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.Map;
-
-public class HanLPTest {
-    @Test
-    public void textHanLP() throws IOException {
-        HanLPClient hanLP = new HanLPClient("https://hanlp.hankcs.com/api", null, "mul", 1500);
-        String document = "算法可大致分为基本算法、数据结构的算法、数论算法、计算几何的算法、图的算法、动态规划以及数值分析、加密算法、排序算法、检索算法、随机化算法、并行算法、厄米变形模型、随机森林算法。\n"
-                + "算法可以宽泛的分为三类,\n" + "一,有限的确定性算法,这类算法在有限的一段时间内终止。他们可能要花很长时间来执行指定的任务,但仍将在一定的时间内终止。这类算法得出的结果常取决于输入值。\n"
-                + "二,有限的非确定算法,这类算法在有限的时间内终止。然而,对于一个(或一些)给定的数值,算法的结果并不是唯一的或确定的。\n"
-                + "三,无限的算法,是那些由于没有定义终止定义条件,或定义的条件无法由输入的数据满足而不终止运行的算法。通常,无限算法的产生是由于未能确定的定义终止条件。";
-
-        Map<String, List> dataMap = hanLP.parse(document);
-
-        System.out.println(JSON.toJSONString(dataMap));
-    }
-}

+ 160 - 150
src/test/java/com/zhiqiyun/SimplePageProcessor.java

@@ -21,154 +21,164 @@ import java.util.regex.Pattern;
 
 @Slf4j
 public class SimplePageProcessor {
-    private final Site site = Site.me().setDomain("hongdou.gxnews.com.cn").setRetryTimes(3).setSleepTime(1000);
-
-    public static void main(String[] args) {
-        PopularFeelingsSiteRule rule = new PopularFeelingsSiteRule();
-        rule.setStartUrls(Arrays.asList("https://www.cqn.com.cn/"));
-        rule.setUrlPatterns("https://www.cqn.com.cn/([a-zA-Z]+)/content/([0-9]+)-([0-9]+)/([0-9]+)/content_([0-9]+).htm");
-        rule.setXpath("/html/body/div[4]/div[1]/div[3]/div[1]");
-
-        Spider spider = Spider.create(new SimplePageProcessor.PopularFeelingsProcessor(rule));
-
-        List<Pipeline> pipelines = new ArrayList<>();
-        pipelines.add(new Pipeline() {
-            @Override
-            public void process(ResultItems resultItems, Task task) {
-                Map<String, Object> dataMap = resultItems.getAll();
-
-                String url = resultItems.getRequest().getUrl();
-                String id = DigestUtils.md5Hex(url.replace("http://", "").replace("https://", ""));
-
-                String title = dataMap.getOrDefault("title", "").toString();
-                String keywords = dataMap.getOrDefault("keywords", "").toString();
-                String description = dataMap.getOrDefault("description", "").toString();
-                String bodyText = dataMap.getOrDefault("bodyText", "").toString();
-                String html = dataMap.getOrDefault("html", "").toString();
-                List<String> listFragments = JSON.parseArray(dataMap.getOrDefault("listFragments", "[]").toString(), String.class);
-
-                String[] listKeywords = new String[]{"旅游投诉", "违法", "柳城"};
-                log.info(JSON.toJSONString(listFragments));
-
-                for (String s : listFragments) {
-
-                }
-//                boolean flag = true;
-//                for (String k : listKeywords) {
-//                    if (!bodyText.contains(k)) {
-//                        flag = false;
-//                        break;
-//                    }
-//                }
-            }
-        });
-        spider.setPipelines(pipelines);
-        spider.setExitWhenComplete(true);
-        spider.addUrl(rule.getStartUrls().toArray(new String[]{}));
-        spider.runAsync();
-    }
-
-
-    static class PopularFeelingsProcessor implements PageProcessor {
-        private final List<String> LIST_SPIDER_URLS = new ArrayList<>();
-
-        private final Site site;
-
-        private final PopularFeelingsSiteRule rule;
-
-
-        public PopularFeelingsProcessor(PopularFeelingsSiteRule rule) {
-            this.rule = rule;
-            this.site = Site.me();
-            this.site.setUseGzip(true);
-            this.site.setRetryTimes(3);
-            this.site.setSleepTime(1000);
-        }
-
-        @Override
-        public void process(Page page) {
-            List<String> listUrls = page.getHtml().links().all();
-            listUrls.removeIf(url -> StringUtils.isBlank(url) || LIST_SPIDER_URLS.contains(url));
-
-            String urlPatterns = this.rule.getUrlPatterns();
-            if (StringUtils.isNotBlank(urlPatterns)) {
-                String[] listPatterns = urlPatterns.split("\n");
-                listUrls.removeIf(url -> {
-                    boolean hasMatched = false;
-                    for (String regex : listPatterns) {
-                        hasMatched = Pattern.matches(regex, url);
-                        if (hasMatched) {
-                            break;
-                        }
-                    }
-                    return !hasMatched;
-                });
-            }
-
-            LIST_SPIDER_URLS.addAll(listUrls);
-
-            page.addTargetRequests(listUrls);
-
-            Document document = page.getHtml().getDocument();
-
-            String title = document.title();
-            String bodyText = document.text();
-            String html = document.html();
-
-            String keywords = "";
-            String description = "";
-            Elements elements = document.getElementsByTag("meta");
-
-            if (elements != null && elements.size() > 0) {
-                for (Element element : elements) {
-                    String metaName = element.attr("name");
-                    String metaContent = element.attr("content");
-                    if (StringUtils.equalsIgnoreCase("keywords", metaName)) {
-                        keywords = metaContent;
-                    }
-                    if (StringUtils.equalsIgnoreCase("description", metaName)) {
-                        description = metaContent;
-                    }
-                }
-            }
-
-            if (StringUtils.isBlank(description) && StringUtils.isNotBlank(bodyText)) {
-                description = bodyText.length() >= 200 ? bodyText.substring(0, 200) : bodyText;
-            }
-
-            if (StringUtils.isBlank(keywords) && StringUtils.isNotBlank(description)) {
-                keywords = description;
-            }
-
-            Selectable selectable = page.getHtml().xpath(this.rule.getXpath());
-            String selectableHtml = selectable.get();
-
-            List<String> listFragments = new ArrayList<>();
-
-            if (StringUtils.isNotBlank(selectableHtml)) {
-                document.html(selectableHtml);
-
-                Elements allElements = document.children().first().children();
-                for (Element element : allElements) {
-                    String text = element.text();
-                    if (StringUtils.isNotBlank(text)) {
-                        listFragments.add(text.trim());
-                    }
-                }
-            }
-
-
-            page.putField("title", title);
-            page.putField("bodyText", bodyText);
-            page.putField("html", html);
-            page.putField("keywords", keywords);
-            page.putField("description", description);
-            page.putField("listFragments", JSON.toJSONString(listFragments));
-        }
-
-        @Override
-        public Site getSite() {
-            return this.site;
-        }
-    }
+	/**
+	 * Manual entry point: crawls the start URL of a hard-coded site rule and, for every page
+	 * that reaches the pipeline, logs whether the body text contains all keywords, how many
+	 * fragments contain at least one keyword, and the body text itself.
+	 *
+	 * @param args unused
+	 */
+	public static void main(String[] args) {
+		PopularFeelingsSiteRule rule = new PopularFeelingsSiteRule();
+		rule.setStartUrls(Arrays.asList("https://www.cqn.com.cn/"));
+		// NOTE(review): the dots in this regex are unescaped and therefore match any character.
+		rule.setUrlPatterns("https://www.cqn.com.cn/([a-zA-Z]+)/content/([0-9]+)-([0-9]+)/([0-9]+)/content_([0-9]+).htm");
+		rule.setXpath("/html/body/div[4]/div[1]/div[3]/div[1]");
+
+		Spider spider = Spider.create(new SimplePageProcessor.PopularFeelingsProcessor(rule));
+
+		List<Pipeline> pipelines = new ArrayList<>();
+		pipelines.add(new Pipeline() {
+			@Override
+			public void process(ResultItems resultItems, Task task) {
+				Map<String, Object> dataMap = resultItems.getAll();
+
+				String bodyText = dataMap.getOrDefault("bodyText", "").toString();
+				// The processor stores the fragments as a JSON array string.
+				List<String> listFragments = JSON.parseArray(dataMap.getOrDefault("listFragments", "[]").toString(), String.class);
+
+				String[] listKeywords = new String[]{"十三届五次会", "习近平"};
+
+				// True only when every keyword occurs somewhere in the body text.
+				boolean containsAllKeywords = Arrays.stream(listKeywords).allMatch(bodyText::contains);
+
+				// Keep only the fragments that mention at least one keyword.
+				listFragments.removeIf(fragment -> Arrays.stream(listKeywords).noneMatch(fragment::contains));
+
+				log.info("{}>>>>>>>>{}>>>>>>>>{}", containsAllKeywords, listFragments.size(), bodyText);
+			}
+		});
+		spider.setPipelines(pipelines);
+		spider.setExitWhenComplete(true);
+		spider.addUrl(rule.getStartUrls().toArray(new String[0]));
+		spider.runAsync();
+	}
+
+
+	/**
+	 * Page processor driven by a {@link PopularFeelingsSiteRule}.
+	 * <p>
+	 * For every page it queues the not-yet-seen links that match the rule's URL patterns, then
+	 * extracts title, meta keywords/description, the text of the node selected by the rule's
+	 * XPath and the text of each child element of that node. Pages whose selected node yields
+	 * no text are marked as skipped.
+	 * <p>
+	 * NOTE(review): the visited-URL set is not synchronized; {@code main} does not configure
+	 * extra spider threads, revisit if that changes.
+	 */
+	static class PopularFeelingsProcessor implements PageProcessor {
+		/** Maximum number of characters taken from the body text as a fallback description. */
+		private static final int MAX_DESCRIPTION_LENGTH = 200;
+
+		/** URLs already handed to the scheduler; a set keeps the per-link lookup constant-time. */
+		private final java.util.Set<String> spiderUrls = new java.util.HashSet<>();
+
+		private final Site site;
+
+		private final PopularFeelingsSiteRule rule;
+
+		/** URL patterns of the rule, compiled once; empty means "accept every link". */
+		private final List<Pattern> urlPatterns;
+
+
+		/**
+		 * @param rule site rule providing the URL patterns (one regex per line) and the content XPath
+		 * @throws java.util.regex.PatternSyntaxException if one of the rule's URL patterns is not a valid regex
+		 */
+		public PopularFeelingsProcessor(PopularFeelingsSiteRule rule) {
+			this.rule = rule;
+			this.urlPatterns = compileUrlPatterns(rule.getUrlPatterns());
+			this.site = Site.me();
+			this.site.setUseGzip(true);
+			this.site.setRetryTimes(3);
+			this.site.setSleepTime(1000);
+		}
+
+		/** Compiles the newline-separated regex list, ignoring blank lines and stray CR/whitespace. */
+		private static List<Pattern> compileUrlPatterns(String urlPatterns) {
+			List<Pattern> compiled = new ArrayList<>();
+			if (StringUtils.isBlank(urlPatterns)) {
+				return compiled;
+			}
+			for (String line : urlPatterns.split("\\r?\\n")) {
+				if (StringUtils.isNotBlank(line)) {
+					compiled.add(Pattern.compile(line.trim()));
+				}
+			}
+			return compiled;
+		}
+
+		/** Returns true when no patterns are configured or the whole URL matches at least one of them. */
+		private boolean matchesUrlPatterns(String url) {
+			if (this.urlPatterns.isEmpty()) {
+				return true;
+			}
+			for (Pattern pattern : this.urlPatterns) {
+				if (pattern.matcher(url).matches()) {
+					return true;
+				}
+			}
+			return false;
+		}
+
+		/**
+		 * Queues follow-up links and publishes the fields {@code title}, {@code bodyText},
+		 * {@code html}, {@code keywords}, {@code description} and {@code listFragments}
+		 * (a JSON array string) on the page.
+		 *
+		 * @param page the downloaded page
+		 */
+		@Override
+		public void process(Page page) {
+			List<String> listUrls = page.getHtml().links().all();
+			listUrls.removeIf(url -> StringUtils.isBlank(url)
+					|| this.spiderUrls.contains(url)
+					|| !this.matchesUrlPatterns(url));
+
+			this.spiderUrls.addAll(listUrls);
+
+			page.addTargetRequests(listUrls);
+
+			Document document = page.getHtml().getDocument();
+
+			String title = document.title();
+			// Captured up front: document.html(String) further down overwrites the document in place.
+			String html = document.html();
+
+			String keywords = "";
+			String description = "";
+			for (Element element : document.getElementsByTag("meta")) {
+				String metaName = element.attr("name");
+				String metaContent = element.attr("content");
+				if (StringUtils.equalsIgnoreCase("keywords", metaName)) {
+					keywords = metaContent;
+				}
+				if (StringUtils.equalsIgnoreCase("description", metaName)) {
+					description = metaContent;
+				}
+			}
+
+			Selectable selectable = page.getHtml().xpath(this.rule.getXpath());
+
+			String bodyText = "";
+			List<String> listFragments = new ArrayList<>();
+			if (selectable.match()) {
+				// Replace the document content with the selected node so text() covers only that node.
+				document.html(selectable.get());
+				bodyText = document.text();
+
+				// The first child is the selected node; it is absent when the selection has no elements.
+				Element selectedRoot = document.children().first();
+				if (selectedRoot != null) {
+					for (Element element : selectedRoot.children()) {
+						String text = element.text();
+						if (StringUtils.isNotBlank(text)) {
+							listFragments.add(text.trim());
+						}
+					}
+				}
+			}
+
+			if (StringUtils.isBlank(bodyText)) {
+				// Nothing useful was extracted; skipped pages do not need any result fields.
+				page.setSkip(true);
+				return;
+			}
+
+			// Fall back to the beginning of the body text when the page has no meta description.
+			if (StringUtils.isBlank(description)) {
+				description = bodyText.length() >= MAX_DESCRIPTION_LENGTH
+						? bodyText.substring(0, MAX_DESCRIPTION_LENGTH)
+						: bodyText;
+			}
+
+			if (StringUtils.isBlank(keywords) && StringUtils.isNotBlank(description)) {
+				keywords = description;
+			}
+
+			page.putField("title", title);
+			page.putField("bodyText", bodyText);
+			page.putField("html", html);
+			page.putField("keywords", keywords);
+			page.putField("description", description);
+			page.putField("listFragments", JSON.toJSONString(listFragments));
+		}
+
+		@Override
+		public Site getSite() {
+			return this.site;
+		}
+	}
 }