|
|
@@ -32,189 +32,189 @@ import java.util.regex.Pattern;
|
|
|
@Service
|
|
|
public class PopularFeelingsServiceImpl extends ServiceImpl<PopularFeelingsMapper, PopularFeelings> implements PopularFeelingsService {
|
|
|
|
|
|
- @Autowired
|
|
|
- private PopularFeelingsPageService popularFeelingsPageService;
|
|
|
-
|
|
|
- private static final ConcurrentHashMap<Long, Spider> SPIDER_RUNNING_MAP = new ConcurrentHashMap<>();
|
|
|
-
|
|
|
- @Override
|
|
|
- public void start(PopularFeelings popular) {
|
|
|
- String[] urls = popular.getStartUrls().toArray(new String[]{});
|
|
|
-
|
|
|
- Spider spider = Spider.create(new PopularFeelingsProcessor(popular));
|
|
|
- List<SpiderListener> listListeners = new ArrayList<>();
|
|
|
- listListeners.add(new SpiderListener() {
|
|
|
- @Override
|
|
|
- public void onSuccess(Request request) {
|
|
|
- log.info("onSuccess>>>>>>{}", request.getUrl());
|
|
|
- }
|
|
|
-
|
|
|
- @Override
|
|
|
- public void onError(Request request) {
|
|
|
- log.info("onError>>>>>>{}", request.getUrl());
|
|
|
- }
|
|
|
- });
|
|
|
- List<Pipeline> pipelines = new ArrayList<>();
|
|
|
- pipelines.add(new Pipeline() {
|
|
|
- @Override
|
|
|
- public void process(ResultItems resultItems, Task task) {
|
|
|
- Map<String, Object> dataMap = resultItems.getAll();
|
|
|
- log.info(resultItems.getRequest().getUrl());
|
|
|
- String url = resultItems.getRequest().getUrl();
|
|
|
- String id = DigestUtils.md5Hex(url.replace("http://", "").replace("https://", ""));
|
|
|
-
|
|
|
- String title = dataMap.getOrDefault("title", "").toString();
|
|
|
- String keywords = dataMap.getOrDefault("keywords", "").toString();
|
|
|
- String description = dataMap.getOrDefault("description", "").toString();
|
|
|
- String bodyText = dataMap.getOrDefault("bodyText", "").toString();
|
|
|
- String html = dataMap.getOrDefault("html", "").toString();
|
|
|
-
|
|
|
- if (StringUtils.isBlank(popular.getKeywords())) {
|
|
|
- return;
|
|
|
- }
|
|
|
-
|
|
|
- String[] listKeywords = popular.getKeywords().split(",");
|
|
|
-
|
|
|
- boolean flag = true;
|
|
|
- for (String k : listKeywords) {
|
|
|
- if (!bodyText.contains(k)) {
|
|
|
- flag = false;
|
|
|
- break;
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- if (flag) {
|
|
|
- PopularFeelingsPage popularFeelingsPage = new PopularFeelingsPage();
|
|
|
-
|
|
|
- popularFeelingsPage.setId(id);
|
|
|
- popularFeelingsPage.setPopularFeelingsId(popular.getId());
|
|
|
- popularFeelingsPage.setUrl(url);
|
|
|
- popularFeelingsPage.setTitle(title);
|
|
|
- popularFeelingsPage.setKeywords(keywords);
|
|
|
- popularFeelingsPage.setDescription(description);
|
|
|
- popularFeelingsPage.setBodyText(bodyText);
|
|
|
- popularFeelingsPage.setHtml(html);
|
|
|
- popularFeelingsPage.setSpiderTime(DateUtil.current());
|
|
|
- popularFeelingsPage.setUpdateTime(DateUtil.current());
|
|
|
- popularFeelingsPageService.saveOrUpdate(popularFeelingsPage);
|
|
|
- }
|
|
|
- }
|
|
|
- });
|
|
|
- spider.setPipelines(pipelines);
|
|
|
- spider.setExitWhenComplete(true);
|
|
|
- spider.setSpiderListeners(listListeners);
|
|
|
- spider.addUrl(urls);
|
|
|
- spider.runAsync();
|
|
|
-
|
|
|
- SPIDER_RUNNING_MAP.put(popular.getId(), spider);
|
|
|
- }
|
|
|
-
|
|
|
- @Override
|
|
|
- public void stop(PopularFeelings popular) {
|
|
|
- Spider spider = SPIDER_RUNNING_MAP.get(popular.getId());
|
|
|
- if (spider != null && spider.getStatus().equals(Spider.Status.Running)) {
|
|
|
- spider.stop();
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- @Override
|
|
|
- public int getStatus(Long popularFeelingsId) {
|
|
|
- Spider spider = SPIDER_RUNNING_MAP.get(popularFeelingsId);
|
|
|
- if (spider == null) {
|
|
|
- return 0;
|
|
|
- } else if (spider.getStatus().equals(Spider.Status.Running)) {
|
|
|
- return 1;
|
|
|
- } else if (spider.getStatus().equals(Spider.Status.Stopped)) {
|
|
|
- return 2;
|
|
|
- } else {
|
|
|
- return 0;
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- static class PopularFeelingsProcessor implements PageProcessor {
|
|
|
- private final List<String> LIST_SPIDER_URLS = new ArrayList<>();
|
|
|
-
|
|
|
- private final Site site;
|
|
|
-
|
|
|
- private final PopularFeelings popular;
|
|
|
-
|
|
|
-
|
|
|
- public PopularFeelingsProcessor(PopularFeelings popular) {
|
|
|
- this.popular = popular;
|
|
|
-
|
|
|
- this.site = Site.me();
|
|
|
- this.site.setDomain(this.popular.getDomain());
|
|
|
- this.site.setUseGzip(true);
|
|
|
- this.site.setRetryTimes(3);
|
|
|
- this.site.setSleepTime(1000);
|
|
|
- }
|
|
|
-
|
|
|
- @Override
|
|
|
- public void process(Page page) {
|
|
|
- List<String> listUrls = page.getHtml().links().all();
|
|
|
- listUrls.removeIf(url -> StringUtils.isBlank(url) || LIST_SPIDER_URLS.contains(url));
|
|
|
-
|
|
|
- String urlPatterns = this.popular.getUrlPatterns();
|
|
|
- if (StringUtils.isNotBlank(urlPatterns)) {
|
|
|
- String[] listPatterns = urlPatterns.split("\n");
|
|
|
- listUrls.removeIf(url -> {
|
|
|
- boolean hasMatched = false;
|
|
|
- for (String regex : listPatterns) {
|
|
|
- hasMatched = Pattern.matches(regex, url);
|
|
|
- if (hasMatched) {
|
|
|
- break;
|
|
|
- }
|
|
|
- }
|
|
|
- return !hasMatched;
|
|
|
- });
|
|
|
- }
|
|
|
-
|
|
|
- LIST_SPIDER_URLS.addAll(listUrls);
|
|
|
-
|
|
|
- page.addTargetRequests(listUrls);
|
|
|
-
|
|
|
- Document document = page.getHtml().getDocument();
|
|
|
-
|
|
|
- String title = document.title();
|
|
|
- String bodyText = document.text();
|
|
|
- String html = document.html();
|
|
|
-
|
|
|
- String keywords = "";
|
|
|
- String description = "";
|
|
|
- Elements elements = document.getElementsByTag("meta");
|
|
|
-
|
|
|
- if (elements != null && elements.size() > 0) {
|
|
|
- for (Element element : elements) {
|
|
|
- String metaName = element.attr("name");
|
|
|
- String metaContent = element.attr("content");
|
|
|
- if (StringUtils.equalsIgnoreCase("keywords", metaName)) {
|
|
|
- keywords = metaContent;
|
|
|
- }
|
|
|
- if (StringUtils.equalsIgnoreCase("description", metaName)) {
|
|
|
- description = metaContent;
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- if (StringUtils.isBlank(description) && StringUtils.isNotBlank(bodyText)) {
|
|
|
- description = bodyText.length() >= 200 ? bodyText.substring(0, 200) : bodyText;
|
|
|
- }
|
|
|
-
|
|
|
- if (StringUtils.isBlank(keywords) && StringUtils.isNotBlank(description)) {
|
|
|
- keywords = description;
|
|
|
- }
|
|
|
-
|
|
|
-
|
|
|
- page.putField("title", title);
|
|
|
- page.putField("bodyText", bodyText);
|
|
|
- page.putField("html", html);
|
|
|
- page.putField("keywords", keywords);
|
|
|
- page.putField("description", description);
|
|
|
- }
|
|
|
-
|
|
|
- @Override
|
|
|
- public Site getSite() {
|
|
|
- return this.site;
|
|
|
- }
|
|
|
- }
|
|
|
+// @Autowired
|
|
|
+// private PopularFeelingsPageService popularFeelingsPageService;
|
|
|
+//
|
|
|
+// private static final ConcurrentHashMap<Long, Spider> SPIDER_RUNNING_MAP = new ConcurrentHashMap<>();
|
|
|
+//
|
|
|
+// @Override
|
|
|
+// public void start(PopularFeelings popular) {
|
|
|
+// String[] urls = popular.getStartUrls().toArray(new String[]{});
|
|
|
+//
|
|
|
+// Spider spider = Spider.create(new PopularFeelingsProcessor(popular));
|
|
|
+// List<SpiderListener> listListeners = new ArrayList<>();
|
|
|
+// listListeners.add(new SpiderListener() {
|
|
|
+// @Override
|
|
|
+// public void onSuccess(Request request) {
|
|
|
+// log.info("onSuccess>>>>>>{}", request.getUrl());
|
|
|
+// }
|
|
|
+//
|
|
|
+// @Override
|
|
|
+// public void onError(Request request) {
|
|
|
+// log.info("onError>>>>>>{}", request.getUrl());
|
|
|
+// }
|
|
|
+// });
|
|
|
+// List<Pipeline> pipelines = new ArrayList<>();
|
|
|
+// pipelines.add(new Pipeline() {
|
|
|
+// @Override
|
|
|
+// public void process(ResultItems resultItems, Task task) {
|
|
|
+// Map<String, Object> dataMap = resultItems.getAll();
|
|
|
+// log.info(resultItems.getRequest().getUrl());
|
|
|
+// String url = resultItems.getRequest().getUrl();
|
|
|
+// String id = DigestUtils.md5Hex(url.replace("http://", "").replace("https://", ""));
|
|
|
+//
|
|
|
+// String title = dataMap.getOrDefault("title", "").toString();
|
|
|
+// String keywords = dataMap.getOrDefault("keywords", "").toString();
|
|
|
+// String description = dataMap.getOrDefault("description", "").toString();
|
|
|
+// String bodyText = dataMap.getOrDefault("bodyText", "").toString();
|
|
|
+// String html = dataMap.getOrDefault("html", "").toString();
|
|
|
+//
|
|
|
+// if (StringUtils.isBlank(popular.getKeywords())) {
|
|
|
+// return;
|
|
|
+// }
|
|
|
+//
|
|
|
+// String[] listKeywords = popular.getKeywords().split(",");
|
|
|
+//
|
|
|
+// boolean flag = true;
|
|
|
+// for (String k : listKeywords) {
|
|
|
+// if (!bodyText.contains(k)) {
|
|
|
+// flag = false;
|
|
|
+// break;
|
|
|
+// }
|
|
|
+// }
|
|
|
+//
|
|
|
+// if (flag) {
|
|
|
+// PopularFeelingsPage popularFeelingsPage = new PopularFeelingsPage();
|
|
|
+//
|
|
|
+// popularFeelingsPage.setId(id);
|
|
|
+// popularFeelingsPage.setPopularFeelingsId(popular.getId());
|
|
|
+// popularFeelingsPage.setUrl(url);
|
|
|
+// popularFeelingsPage.setTitle(title);
|
|
|
+// popularFeelingsPage.setKeywords(keywords);
|
|
|
+// popularFeelingsPage.setDescription(description);
|
|
|
+// popularFeelingsPage.setBodyText(bodyText);
|
|
|
+// popularFeelingsPage.setHtml(html);
|
|
|
+// popularFeelingsPage.setSpiderTime(DateUtil.current());
|
|
|
+// popularFeelingsPage.setUpdateTime(DateUtil.current());
|
|
|
+// popularFeelingsPageService.saveOrUpdate(popularFeelingsPage);
|
|
|
+// }
|
|
|
+// }
|
|
|
+// });
|
|
|
+// spider.setPipelines(pipelines);
|
|
|
+// spider.setExitWhenComplete(true);
|
|
|
+// spider.setSpiderListeners(listListeners);
|
|
|
+// spider.addUrl(urls);
|
|
|
+// spider.runAsync();
|
|
|
+//
|
|
|
+// SPIDER_RUNNING_MAP.put(popular.getId(), spider);
|
|
|
+// }
|
|
|
+//
|
|
|
+// @Override
|
|
|
+// public void stop(PopularFeelings popular) {
|
|
|
+// Spider spider = SPIDER_RUNNING_MAP.get(popular.getId());
|
|
|
+// if (spider != null && spider.getStatus().equals(Spider.Status.Running)) {
|
|
|
+// spider.stop();
|
|
|
+// }
|
|
|
+// }
|
|
|
+//
|
|
|
+// @Override
|
|
|
+// public int getStatus(Long popularFeelingsId) {
|
|
|
+// Spider spider = SPIDER_RUNNING_MAP.get(popularFeelingsId);
|
|
|
+// if (spider == null) {
|
|
|
+// return 0;
|
|
|
+// } else if (spider.getStatus().equals(Spider.Status.Running)) {
|
|
|
+// return 1;
|
|
|
+// } else if (spider.getStatus().equals(Spider.Status.Stopped)) {
|
|
|
+// return 2;
|
|
|
+// } else {
|
|
|
+// return 0;
|
|
|
+// }
|
|
|
+// }
|
|
|
+//
|
|
|
+// static class PopularFeelingsProcessor implements PageProcessor {
|
|
|
+// private final List<String> LIST_SPIDER_URLS = new ArrayList<>();
|
|
|
+//
|
|
|
+// private final Site site;
|
|
|
+//
|
|
|
+// private final PopularFeelings popular;
|
|
|
+//
|
|
|
+//
|
|
|
+// public PopularFeelingsProcessor(PopularFeelings popular) {
|
|
|
+// this.popular = popular;
|
|
|
+//
|
|
|
+// this.site = Site.me();
|
|
|
+// this.site.setDomain(this.popular.getDomain());
|
|
|
+// this.site.setUseGzip(true);
|
|
|
+// this.site.setRetryTimes(3);
|
|
|
+// this.site.setSleepTime(1000);
|
|
|
+// }
|
|
|
+//
|
|
|
+// @Override
|
|
|
+// public void process(Page page) {
|
|
|
+// List<String> listUrls = page.getHtml().links().all();
|
|
|
+// listUrls.removeIf(url -> StringUtils.isBlank(url) || LIST_SPIDER_URLS.contains(url));
|
|
|
+//
|
|
|
+// String urlPatterns = this.popular.getUrlPatterns();
|
|
|
+// if (StringUtils.isNotBlank(urlPatterns)) {
|
|
|
+// String[] listPatterns = urlPatterns.split("\n");
|
|
|
+// listUrls.removeIf(url -> {
|
|
|
+// boolean hasMatched = false;
|
|
|
+// for (String regex : listPatterns) {
|
|
|
+// hasMatched = Pattern.matches(regex, url);
|
|
|
+// if (hasMatched) {
|
|
|
+// break;
|
|
|
+// }
|
|
|
+// }
|
|
|
+// return !hasMatched;
|
|
|
+// });
|
|
|
+// }
|
|
|
+//
|
|
|
+// LIST_SPIDER_URLS.addAll(listUrls);
|
|
|
+//
|
|
|
+// page.addTargetRequests(listUrls);
|
|
|
+//
|
|
|
+// Document document = page.getHtml().getDocument();
|
|
|
+//
|
|
|
+// String title = document.title();
|
|
|
+// String bodyText = document.text();
|
|
|
+// String html = document.html();
|
|
|
+//
|
|
|
+// String keywords = "";
|
|
|
+// String description = "";
|
|
|
+// Elements elements = document.getElementsByTag("meta");
|
|
|
+//
|
|
|
+// if (elements != null && elements.size() > 0) {
|
|
|
+// for (Element element : elements) {
|
|
|
+// String metaName = element.attr("name");
|
|
|
+// String metaContent = element.attr("content");
|
|
|
+// if (StringUtils.equalsIgnoreCase("keywords", metaName)) {
|
|
|
+// keywords = metaContent;
|
|
|
+// }
|
|
|
+// if (StringUtils.equalsIgnoreCase("description", metaName)) {
|
|
|
+// description = metaContent;
|
|
|
+// }
|
|
|
+// }
|
|
|
+// }
|
|
|
+//
|
|
|
+// if (StringUtils.isBlank(description) && StringUtils.isNotBlank(bodyText)) {
|
|
|
+// description = bodyText.length() >= 200 ? bodyText.substring(0, 200) : bodyText;
|
|
|
+// }
|
|
|
+//
|
|
|
+// if (StringUtils.isBlank(keywords) && StringUtils.isNotBlank(description)) {
|
|
|
+// keywords = description;
|
|
|
+// }
|
|
|
+//
|
|
|
+//
|
|
|
+// page.putField("title", title);
|
|
|
+// page.putField("bodyText", bodyText);
|
|
|
+// page.putField("html", html);
|
|
|
+// page.putField("keywords", keywords);
|
|
|
+// page.putField("description", description);
|
|
|
+// }
|
|
|
+//
|
|
|
+// @Override
|
|
|
+// public Site getSite() {
|
|
|
+// return this.site;
|
|
|
+// }
|
|
|
+// }
|
|
|
}
|