stjdydayou 4 年之前
父節點
當前提交
3d2d1e2770

+ 9 - 39
pom.xml

@@ -19,10 +19,10 @@
         <commons-codec.version>1.15</commons-codec.version>
         <ip2region.version>1.7.2</ip2region.version>
 
-        <spring.version>5.3.14</spring.version>
-        <spring-boot.version>2.6.3</spring-boot.version>
+        <spring.version>5.3.16</spring.version>
+        <spring-boot.version>2.6.4</spring-boot.version>
 
-        <mysql-connector-java.version>8.0.27</mysql-connector-java.version>
+        <mysql-connector-java.version>8.0.28</mysql-connector-java.version>
         <mybatis-spring-boot-starter.version>2.2.0</mybatis-spring-boot-starter.version>
         <mybatis.version>3.5.7</mybatis.version>
         <mybatis-plus-boot.version>3.5.1</mybatis-plus-boot.version>
@@ -33,7 +33,7 @@
 
 
         <afirma-lib-jmimemagic.version>0.0.6</afirma-lib-jmimemagic.version>
-        <thumbnailator.version>0.4.16</thumbnailator.version>
+        <thumbnailator.version>0.4.17</thumbnailator.version>
 
         <framework.version>1.0.20</framework.version>
         <db-migration.version>1.0.1</db-migration.version>
@@ -145,41 +145,11 @@
             <artifactId>webmagic-extension</artifactId>
             <version>0.7.5</version>
         </dependency>
-
-        <!--        <dependency>-->
-        <!--            <groupId>us.codecraft</groupId>-->
-        <!--            <artifactId>webmagic-core</artifactId>-->
-        <!--            <version>0.6.0</version>-->
-        <!--            <exclusions>-->
-        <!--                <exclusion>-->
-        <!--                    <groupId>org.slf4j</groupId>-->
-        <!--                    <artifactId>slf4j-log4j12</artifactId>-->
-        <!--                </exclusion>-->
-        <!--                <exclusion>-->
-        <!--                    <artifactId>slf4j-api</artifactId>-->
-        <!--                    <groupId>org.slf4j</groupId>-->
-        <!--                </exclusion>-->
-        <!--                <exclusion>-->
-        <!--                    <artifactId>guava</artifactId>-->
-        <!--                    <groupId>com.google.guava</groupId>-->
-        <!--                </exclusion>-->
-        <!--                <exclusion>-->
-        <!--                    <artifactId>commons-lang3</artifactId>-->
-        <!--                    <groupId>org.apache.commons</groupId>-->
-        <!--                </exclusion>-->
-        <!--            </exclusions>-->
-        <!--        </dependency>-->
-        <!--        <dependency>-->
-        <!--            <groupId>us.codecraft</groupId>-->
-        <!--            <artifactId>webmagic-extension</artifactId>-->
-        <!--            <version>0.6.0</version>-->
-        <!--            <exclusions>-->
-        <!--                <exclusion>-->
-        <!--                    <groupId>org.slf4j</groupId>-->
-        <!--                    <artifactId>slf4j-log4j12</artifactId>-->
-        <!--                </exclusion>-->
-        <!--            </exclusions>-->
-        <!--        </dependency>-->
+        <dependency>
+            <groupId>com.hankcs.hanlp.restful</groupId>
+            <artifactId>hanlp-restful</artifactId>
+            <version>0.0.7</version>
+        </dependency>
     </dependencies>
     <build>
         <plugins>

+ 34 - 0
src/main/java/com/zhiqiyun/open/core/models/statistics/PopularFeelingsSiteRule.java

@@ -0,0 +1,34 @@
+package com.zhiqiyun.open.core.models.statistics;
+
+import com.baomidou.mybatisplus.annotation.FieldStrategy;
+import com.baomidou.mybatisplus.annotation.TableField;
+import com.baomidou.mybatisplus.annotation.TableName;
+import com.zhiqiyun.open.core.typeHandler.FastjsonTypeHandler;
+import lombok.Data;
+
+import java.util.Date;
+import java.util.List;
+
+/**
+ * Persistent crawl rule for one site: where the spider starts, which links it may
+ * follow and which page element holds the content to extract.
+ * Consumed by {@code SimplePageProcessor.PopularFeelingsProcessor}.
+ *
+ * NOTE(review): the mapped table is "popular_feelings" although the class models a
+ * site rule - confirm this is the intended table and not a copy/paste leftover.
+ *
+ * @author jtoms
+ */
+@Data
+@TableName(value = "popular_feelings", autoResultMap = true)
+public class PopularFeelingsSiteRule {
+    private Long id;
+    // Presumably a human-readable label for the site; not read by the crawler code - TODO confirm.
+    private String siteName;
+    // One regular expression per line ("\n"-separated). The processor only follows links
+    // that fully match at least one of them; blank means no link filtering.
+    private String urlPatterns;
+
+    // Seed URLs handed to the spider. Persisted through FastjsonTypeHandler
+    // (presumably as a JSON array column - TODO confirm the column type).
+    @TableField(typeHandler = FastjsonTypeHandler.class)
+    private List<String> startUrls;
+
+    // XPath of the container element whose direct children are turned into text fragments.
+    private String xpath;
+
+    // Creation audit fields: updateStrategy NEVER keeps them out of generated UPDATE statements.
+    @TableField(updateStrategy = FieldStrategy.NEVER)
+    private Date createdTime;
+    @TableField(updateStrategy = FieldStrategy.NEVER)
+    private Long createdBy;
+
+    // Last-modification audit fields; nothing in this class populates them automatically.
+    private Date updatedTime;
+    private Long updatedBy;
+}

+ 0 - 2
src/main/resources/application-prod.properties

@@ -9,8 +9,6 @@ spring.redis.host=39.99.217.107
 spring.redis.password=hnylredis@
 spring.redis.port=6379
 spring.redis.database=0
-
-
 ####################### uploader config ###############################
 uploader.max-size=20480000
 uploader.file-host=http://47.114.32.188:9800/src

+ 24 - 0
src/test/java/com/zhiqiyun/HanLPTest.java

@@ -0,0 +1,24 @@
+package com.zhiqiyun;
+
+import com.alibaba.fastjson.JSON;
+import com.hankcs.hanlp.restful.HanLPClient;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Manual smoke test for the HanLP RESTful client: sends a fixed Chinese sample text to
+ * the hosted API and prints the parsed result as JSON. It needs network access and
+ * asserts nothing, so it only fails when the call itself throws.
+ */
+public class HanLPTest {
+    /** Fixed sample text submitted to the service (five lines joined by '\n'). */
+    private static final String SAMPLE_DOCUMENT =
+            "算法可大致分为基本算法、数据结构的算法、数论算法、计算几何的算法、图的算法、动态规划以及数值分析、加密算法、排序算法、检索算法、随机化算法、并行算法、厄米变形模型、随机森林算法。\n"
+                    + "算法可以宽泛的分为三类,\n"
+                    + "一,有限的确定性算法,这类算法在有限的一段时间内终止。他们可能要花很长时间来执行指定的任务,但仍将在一定的时间内终止。这类算法得出的结果常取决于输入值。\n"
+                    + "二,有限的非确定算法,这类算法在有限的时间内终止。然而,对于一个(或一些)给定的数值,算法的结果并不是唯一的或确定的。\n"
+                    + "三,无限的算法,是那些由于没有定义终止定义条件,或定义的条件无法由输入的数据满足而不终止运行的算法。通常,无限算法的产生是由于未能确定的定义终止条件。";
+
+    /**
+     * Parses {@link #SAMPLE_DOCUMENT} through the remote service and dumps the response.
+     *
+     * @throws IOException if the HTTP call fails
+     */
+    @Test
+    public void textHanLP() throws IOException {
+        // Second argument is null in the original as well (presumably: no auth token).
+        HanLPClient client = new HanLPClient("https://hanlp.hankcs.com/api", null, "mul", 1500);
+
+        Map<String, List> parsed = client.parse(SAMPLE_DOCUMENT);
+        String json = JSON.toJSONString(parsed);
+
+        System.out.println(json);
+    }
+}

+ 154 - 24
src/test/java/com/zhiqiyun/SimplePageProcessor.java

@@ -1,44 +1,174 @@
 package com.zhiqiyun;
 
+import com.alibaba.fastjson.JSON;
+import com.zhiqiyun.open.core.models.statistics.PopularFeelingsSiteRule;
 import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.commons.lang3.StringUtils;
-import us.codecraft.webmagic.Page;
-import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.Spider;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import us.codecraft.webmagic.*;
+import us.codecraft.webmagic.pipeline.Pipeline;
 import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.selector.Selectable;
 
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
 
 @Slf4j
-public class SimplePageProcessor implements PageProcessor {
-    private final Site site = Site.me().setDomain("news.qq.com").setRetryTimes(3).setSleepTime(1000);
+public class SimplePageProcessor {
+    private final Site site = Site.me().setDomain("hongdou.gxnews.com.cn").setRetryTimes(3).setSleepTime(1000);
 
-    private static final List<String> LIST_SPIDER_URLS = new ArrayList<>();
+    /**
+     * Manual entry point: builds a hard-coded rule for https://www.cqn.com.cn/, crawls it
+     * with {@link PopularFeelingsProcessor} and logs the text fragments extracted from
+     * every processed page.
+     *
+     * @param args unused
+     */
+    public static void main(String[] args) {
+        PopularFeelingsSiteRule rule = new PopularFeelingsSiteRule();
+        rule.setStartUrls(Arrays.asList("https://www.cqn.com.cn/"));
+        // NOTE(review): the dots in this regex are unescaped, so they match any character.
+        rule.setUrlPatterns("https://www.cqn.com.cn/([a-zA-Z]+)/content/([0-9]+)-([0-9]+)/([0-9]+)/content_([0-9]+).htm");
+        rule.setXpath("/html/body/div[4]/div[1]/div[3]/div[1]");
 
-    public SimplePageProcessor() {
-    }
+        Spider spider = Spider.create(new SimplePageProcessor.PopularFeelingsProcessor(rule));
+
+        // Single inline pipeline; at the moment its only visible effect is the log line below.
+        List<Pipeline> pipelines = new ArrayList<>();
+        pipelines.add(new Pipeline() {
+            @Override
+            public void process(ResultItems resultItems, Task task) {
+                Map<String, Object> dataMap = resultItems.getAll();
+
+                String url = resultItems.getRequest().getUrl();
+                // MD5 over the URL with its http:// or https:// prefix removed, so both
+                // schemes of the same address produce the same id.
+                String id = DigestUtils.md5Hex(url.replace("http://", "").replace("https://", ""));
+
+                // NOTE(review): id, title, keywords, description, bodyText, html and listKeywords
+                // are assigned but never read - this looks like unfinished scaffolding.
+                String title = dataMap.getOrDefault("title", "").toString();
+                String keywords = dataMap.getOrDefault("keywords", "").toString();
+                String description = dataMap.getOrDefault("description", "").toString();
+                String bodyText = dataMap.getOrDefault("bodyText", "").toString();
+                String html = dataMap.getOrDefault("html", "").toString();
+                // The processor stores the fragments as a JSON string, hence the parse step here.
+                List<String> listFragments = JSON.parseArray(dataMap.getOrDefault("listFragments", "[]").toString(), String.class);
 
-    public void process(Page page) {
-        List<String> listUrls = page.getHtml().links().all();
-        listUrls.removeIf(s -> {
-            return LIST_SPIDER_URLS.contains(s) || StringUtils.isBlank(s);
+                String[] listKeywords = new String[]{"旅游投诉", "违法", "柳城"};
+                log.info(JSON.toJSONString(listFragments));
+
+                // NOTE(review): empty loop body - per-fragment handling has not been written yet.
+                for (String s : listFragments) {
+
+                }
+                // NOTE(review): disabled draft of an "all keywords present in bodyText" check.
+//                boolean flag = true;
+//                for (String k : listKeywords) {
+//                    if (!bodyText.contains(k)) {
+//                        flag = false;
+//                        break;
+//                    }
+//                }
+            }
         });
-        LIST_SPIDER_URLS.addAll(listUrls);
-        log.info(">>>>>>>>{}", LIST_SPIDER_URLS.size());
-        page.addTargetRequests(listUrls);
+        spider.setPipelines(pipelines);
+        spider.setExitWhenComplete(true);
+        spider.addUrl(rule.getStartUrls().toArray(new String[]{}));
+        // Started with runAsync(), so main() does not wait for the crawl here.
+        spider.runAsync();
+    }
+
 
-        page.putField("title", page.getHtml().getDocument().title());
-        if (page.getResultItems().get("title") == null) {
-            page.setSkip(true);
+    /**
+     * WebMagic page processor configured by a {@link PopularFeelingsSiteRule}.
+     *
+     * For every page it queues the not-yet-seen links that match the rule's URL patterns
+     * and publishes these result fields: {@code title}, {@code bodyText}, {@code html},
+     * {@code keywords}, {@code description} and {@code listFragments} (a JSON array string
+     * holding the text of each direct child of the element selected by the rule's XPath).
+     *
+     * Not written for concurrent use: the visited-URL set is a plain {@link java.util.HashSet}.
+     */
+    static class PopularFeelingsProcessor implements PageProcessor {
+        /** Upper bound on the body-text prefix used when a page has no meta description. */
+        private static final int DESCRIPTION_FALLBACK_LENGTH = 200;
+
+        /**
+         * URLs this processor has already queued. A hash set keeps the "seen before?" check
+         * constant-time and also collapses duplicates that occur within a single page.
+         * Fully qualified so the class does not depend on additional imports.
+         */
+        private final java.util.Set<String> visitedUrls = new java.util.HashSet<>();
+
+        private final Site site;
+
+        private final PopularFeelingsSiteRule rule;
+
+        /** URL filters compiled once from the rule; empty means every link is followed. */
+        private final List<Pattern> urlPatterns = new ArrayList<>();
+
+        /**
+         * @param rule crawl rule supplying the URL patterns and the content XPath; must not be null
+         * @throws java.util.regex.PatternSyntaxException if one of the rule's patterns is not a valid regex
+         */
+        public PopularFeelingsProcessor(PopularFeelingsSiteRule rule) {
+            this.rule = rule;
+            this.site = Site.me();
+            this.site.setUseGzip(true);
+            this.site.setRetryTimes(3);
+            this.site.setSleepTime(1000);
+
+            // Compile the patterns here instead of once per URL per page.
+            String rawPatterns = rule.getUrlPatterns();
+            if (StringUtils.isNotBlank(rawPatterns)) {
+                for (String line : rawPatterns.split("\n")) {
+                    // trim() also removes the '\r' that CRLF-separated input leaves on each
+                    // line, which previously made such patterns impossible to match.
+                    String regex = line.trim();
+                    if (!regex.isEmpty()) {
+                        this.urlPatterns.add(Pattern.compile(regex));
+                    }
+                }
+            }
         }
-    }
 
-    public Site getSite() {
-        return this.site;
-    }
+        /**
+         * Queues the page's followable links and stores the extracted fields on the page.
+         *
+         * @param page the downloaded page handed over by the spider
+         */
+        @Override
+        public void process(Page page) {
+            List<String> listUrls = page.getHtml().links().all();
+            // Order matters: add() runs last so that only links which are actually followed
+            // get remembered as visited.
+            listUrls.removeIf(url -> StringUtils.isBlank(url) || !matchesUrlPatterns(url) || !this.visitedUrls.add(url));
+            page.addTargetRequests(listUrls);
 
-    public static void main(String[] args) {
-        Spider.create(new SimplePageProcessor()).addUrl("https://news.qq.com/").run();
+            Document document = page.getHtml().getDocument();
+
+            String title = document.title();
+            String bodyText = document.text();
+            String html = document.html();
+
+            String keywords = "";
+            String description = "";
+            // When a meta name occurs more than once the last occurrence wins.
+            for (Element meta : document.getElementsByTag("meta")) {
+                String metaName = meta.attr("name");
+                String metaContent = meta.attr("content");
+                if (StringUtils.equalsIgnoreCase("keywords", metaName)) {
+                    keywords = metaContent;
+                }
+                if (StringUtils.equalsIgnoreCase("description", metaName)) {
+                    description = metaContent;
+                }
+            }
+
+            // Fallbacks: description <- leading body text, keywords <- description.
+            if (StringUtils.isBlank(description) && StringUtils.isNotBlank(bodyText)) {
+                description = StringUtils.left(bodyText, DESCRIPTION_FALLBACK_LENGTH);
+            }
+            if (StringUtils.isBlank(keywords) && StringUtils.isNotBlank(description)) {
+                keywords = description;
+            }
+
+            // Must stay after title/bodyText/html were captured: it replaces the document content.
+            List<String> listFragments = extractFragments(page, document);
+
+            page.putField("title", title);
+            page.putField("bodyText", bodyText);
+            page.putField("html", html);
+            page.putField("keywords", keywords);
+            page.putField("description", description);
+            page.putField("listFragments", JSON.toJSONString(listFragments));
+        }
+
+        /** Returns true when no patterns are configured or the URL fully matches one of them. */
+        private boolean matchesUrlPatterns(String url) {
+            if (this.urlPatterns.isEmpty()) {
+                return true;
+            }
+            for (Pattern pattern : this.urlPatterns) {
+                if (pattern.matcher(url).matches()) {
+                    return true;
+                }
+            }
+            return false;
+        }
+
+        /**
+         * Collects the non-blank text of each direct child of the element selected by the
+         * rule's XPath. Returns an empty list when the XPath is blank or selects nothing.
+         * Side effect: replaces the content of {@code document} with the selected HTML.
+         */
+        private List<String> extractFragments(Page page, Document document) {
+            List<String> fragments = new ArrayList<>();
+
+            String xpath = this.rule.getXpath();
+            if (StringUtils.isBlank(xpath)) {
+                return fragments;
+            }
+
+            String selectedHtml = page.getHtml().xpath(xpath).get();
+            if (StringUtils.isBlank(selectedHtml)) {
+                return fragments;
+            }
+
+            document.html(selectedHtml);
+            // first() is null when the selected markup yields no element (for example bare text).
+            Element container = document.children().first();
+            if (container == null) {
+                return fragments;
+            }
+
+            for (Element child : container.children()) {
+                String text = child.text();
+                if (StringUtils.isNotBlank(text)) {
+                    fragments.add(text.trim());
+                }
+            }
+            return fragments;
+        }
+
+        /** @return the site settings created in the constructor (gzip on, 3 retries, 1000 ms sleep) */
+        @Override
+        public Site getSite() {
+            return this.site;
+        }
     }
 }