Extend Nutch2 to Get Outlinks from COOLjsTree Javascript File


We use Nutch2 to crawl a documentation site and store the index in Solr 4.x to implement a documentation search function.

But I ran into one problem: the documentation site uses COOLjsTree, and its HTML pages define the left-side menu in tree_nodes.js:
END_USER: {
  NODES: [
   ["End User 1", "../../products/end_user1.htm", "_top"],
   ["End User 2", "../../products/end_user2.htm", "_top"],
  ],
  TITLE: " End-User"
}
Nutch2 provides the parse-js plugin to find outlinks defined in JavaScript files or embedded JavaScript sections.
It uses the following regular expression to find outlinks:
org.apache.nutch.parse.js.JSParseFilter
  private static final String STRING_PATTERN = "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)";
  private static final String URI_PATTERN = "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)";
It can find links like: http://site.com/folder/pagea.html. But it doesn't work for the links we defined in our tree_nodes.js.

Luckily, we can easily write our own Nutch plugin to modify or extend Nutch.

We can create our own parse-tree-nodes-js plugin, write our own ParseFilter and Parser to parse outlinks from our tree_nodes.js file.
Implementation Code
First check whether the file is a JavaScript file whose name ends with tree_nodes.js; if so, get the links from the file via a regular-expression pattern like the one below ("*.htm|html|pdf"):

  private static final String URL_PATTERN_IN_TREE_NODE_JS = "\"([^\"]*.[htm|html|pdf])\"";

package org.jefferyyuan.codeexample.nutch.parse.js.treenodes;

public class TreeNodesJSParseFilter implements ParseFilter, Parser {
  /** Maximum number of characters taken from the script's first line as the parse title. */
  private static final int MAX_TITLE_LEN = 80;

  /**
   * Links that are used as-is instead of being resolved against the base URL:
   * anything starting with {@code http://}, {@code https://} or {@code www.}.
   * (A character class such as {@code [http|https|www]} would instead match any
   * link whose first letter is one of h, t, p, s, w.)
   */
  private static final String ABSOLUTE_URL_PATTERN_STR = "^(?:https?://|www\\.).*";

  /**
   * A double-quoted string literal ending in {@code .htm}, {@code .html} or
   * {@code .pdf}; group 1 captures the link without the quotes.
   */
  private static final String CV_TREE_NODE_LINK_PATTERN_STR = "\"([^\"]*\\.(?:html?|pdf))\"";

  private static final PatternCompiler patternCompiler = new Perl5Compiler();
  private static final Pattern ABSOLUTE_URL_PATTERN;
  private static final Pattern CV_TREE_NODE_LINK_PATTERN;

  static {
    try {
      ABSOLUTE_URL_PATTERN = patternCompiler.compile(ABSOLUTE_URL_PATTERN_STR,
          Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
              | Perl5Compiler.SINGLELINE_MASK);
      CV_TREE_NODE_LINK_PATTERN = patternCompiler.compile(
          CV_TREE_NODE_LINK_PATTERN_STR, Perl5Compiler.CASE_INSENSITIVE_MASK
              | Perl5Compiler.READ_ONLY_MASK | Perl5Compiler.MULTILINE_MASK);
    } catch (MalformedPatternException e) {
      // Both patterns are compile-time constants, so a failure here is a
      // programming error. Fail fast instead of leaving the fields null and
      // hitting NullPointerExceptions on every later match.
      throw new ExceptionInInitializerError(e);
    }
  }
  /**
   * For pages accepted by {@code shouldHandle}, walks the DOM for links hidden
   * in JavaScript and returns a new {@code Parse} whose outlinks are the newly
   * found links followed by the ones already present. In every other case the
   * incoming {@code parse} is returned unchanged.
   */
  @Override
  public Parse filter(String url, WebPage page, Parse parse,
      HTMLMetaTags metaTags, DocumentFragment doc) {
    if (!shouldHandle(page)) {
      return parse;
    }

    List<Outlink> found = new ArrayList<Outlink>();
    walk(doc, parse, metaTags, url, found);
    if (found.isEmpty()) {
      // Nothing new was discovered, so keep the existing parse.
      return parse;
    }

    // New links first, then the outlinks the earlier parse already carried.
    found.addAll(Arrays.asList(parse.getOutlinks()));
    Outlink[] merged = found.toArray(new Outlink[found.size()]);
    return new Parse(parse.getText(), parse.getTitle(), merged,
        parse.getParseStatus());
  }

  /**
   * Recursively walks the DOM below {@code n} and appends to {@code outlinks}
   * every link that {@code getJSLinks} extracts from:
   * <ul>
   * <li>the text content of {@code script} elements,</li>
   * <li>attributes whose name starts with {@code on} (event handlers),</li>
   * <li>{@code href} attributes whose value contains {@code javascript:}.</li>
   * </ul>
   *
   * @param n        node to inspect; its children are visited recursively
   * @param parse    passed through to recursive calls, not otherwise used here
   * @param metaTags passed through to recursive calls, not otherwise used here
   * @param base     URL handed to {@code getJSLinks} as the resolution base
   * @param outlinks accumulator that receives the links found
   */
  private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base,
      List<Outlink> outlinks) {
    if (n instanceof Element) {
      if (n.getNodeName().equalsIgnoreCase("script")) {
        NodeList parts = n.getChildNodes();
        if (parts.getLength() > 0) {
          // Join the child nodes' values, one per line, into a single script text.
          StringBuilder script = new StringBuilder();
          for (int i = 0; i < parts.getLength(); i++) {
            if (i > 0) {
              script.append('\n');
            }
            script.append(parts.item(i).getNodeValue());
          }
          Outlink[] links = getJSLinks(script.toString(), "", base);
          if (links != null && links.length > 0) {
            outlinks.addAll(Arrays.asList(links));
          }
          // no other children of interest here, go one level up.
          return;
        }
      } else {
        // Process all HTML 4.0 event attributes, if present:
        // Window: onload,onunload
        // Form: onchange,onsubmit,onreset,onselect,onblur,onfocus
        // Keyboard: onkeydown,onkeypress,onkeyup
        // Mouse: onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
        NamedNodeMap attrs = n.getAttributes();
        int len = attrs.getLength();
        for (int i = 0; i < len; i++) {
          Node attr = attrs.item(i);
          String attrName = attr.getNodeName();
          String value = attr.getNodeValue();
          if (value == null) {
            // Nothing to scan for links.
            continue;
          }
          Outlink[] links = null;
          if (attrName.startsWith("on")) {
            links = getJSLinks(value, "", base);
          } else if (attrName.equalsIgnoreCase("href")
              // Locale.ROOT keeps the comparison independent of the JVM's
              // default locale (e.g. the Turkish dotless i).
              && value.toLowerCase(java.util.Locale.ROOT).indexOf("javascript:") != -1) {
            links = getJSLinks(value, "", base);
          }
          if (links != null && links.length > 0) {
            outlinks.addAll(Arrays.asList(links));
          }
        }
      }
    }
    NodeList children = n.getChildNodes();
    for (int i = 0; i < children.getLength(); i++) {
      walk(children.item(i), parse, metaTags, base, outlinks);
    }
  }

  /**
   * Tells whether this plugin is responsible for the page: true only when the
   * page's base URL is present and ends with {@code tree_nodes.js}.
   */
  private boolean shouldHandle(WebPage page) {
    final String baseUrl = TableUtil.toString(page.getBaseUrl());
    return baseUrl != null && baseUrl.endsWith("tree_nodes.js");
  }

  /**
   * Parses a {@code tree_nodes.js} page: the whole script becomes the parse
   * text, its first line (capped at {@code MAX_TITLE_LEN} characters) becomes
   * the title, and the links extracted by {@code getJSLinks} become the
   * outlinks. Pages rejected by {@code shouldHandle} yield an empty parse with
   * status {@code FAILED_INVALID_FORMAT}.
   *
   * @param url  URL of the page, used as the base for resolving relative links
   * @param page fetched page whose content is the script
   * @return the parse result, never {@code null}
   */
  @Override
  public Parse getParse(String url, WebPage page) {
    if (!shouldHandle(page)) {
      return ParseStatusUtils.getEmptyParse(
          ParseStatusCodes.FAILED_INVALID_FORMAT, "Content not JavaScript: '"
              + TableUtil.toString(page.getContentType()) + "'", getConf());
    }
    // Decode with an explicit charset so the result does not depend on the
    // JVM's platform-default encoding.
    // NOTE(review): assumes the script is UTF-8 (or plain ASCII) - confirm for
    // the crawled site. array() also ignores the buffer's position/limit.
    String script = new String(page.getContent().array(),
        java.nio.charset.Charset.forName("UTF-8"));
    Outlink[] outlinks = getJSLinks(script, "", url);
    if (outlinks == null) {
      outlinks = new Outlink[0];
    }
    // Title: the first line of the script, or the whole script if it has no
    // line break, truncated to MAX_TITLE_LEN characters.
    int firstLineEnd = script.indexOf('\n');
    if (firstLineEnd == -1) {
      firstLineEnd = script.length();
    }
    String title = script.substring(0, Math.min(firstLineEnd, MAX_TITLE_LEN));
    return new Parse(script, title, outlinks, ParseStatusUtils.STATUS_SUCCESS);
  }

  /**
   * Extracts links from quoted literals embedded in JavaScript text, using
   * {@code CV_TREE_NODE_LINK_PATTERN}. Each match is turned into an absolute
   * URL via {@code toAbsolutePath} and wrapped in an {@code Outlink}. Links that
   * cannot be resolved or parsed are logged and skipped; they do not stop the
   * extraction of the remaining links.
   *
   * @param plainText JavaScript source to scan; {@code null} yields no links
   * @param anchor    anchor text given to every created outlink
   * @param base      URL of the containing file; its last path segment is
   *                  removed before relative links are resolved against it
   * @return the outlinks found, possibly empty, never {@code null}
   */
  private static Outlink[] getJSLinks(String plainText, String anchor,
      String base) {
    if (plainText == null || base == null) {
      return new Outlink[0];
    }
    final long start = System.currentTimeMillis();

    // The base normally looks like http://.../tree_nodes.js: drop the file
    // name. Only cut at a '/' that belongs to the path, never inside
    // "scheme://", and leave the base alone when it has no such '/'.
    final int schemeSep = base.indexOf("://");
    final int firstPathIndex = schemeSep >= 0 ? schemeSep + 3 : 0;
    final int lastSlash = base.lastIndexOf('/');
    if (lastSlash >= firstPathIndex) {
      base = base.substring(0, lastSlash);
    }

    final List<Outlink> outlinks = new ArrayList<Outlink>();

    // Only used to enrich the log messages below.
    URL baseURL = null;
    try {
      baseURL = new URL(base);
    } catch (Exception e) {
      if (LOG.isErrorEnabled()) {
        LOG.error("error assigning base URL", e);
      }
    }

    try {
      final PatternMatcher matcher = new Perl5Matcher();
      final PatternMatcherInput input = new PatternMatcherInput(plainText);

      // loop the matches
      while (matcher.contains(input, CV_TREE_NODE_LINK_PATTERN)) {
        // if this is taking too long, stop matching
        // (SHOULD really check cpu time used so that heavily loaded systems
        // do not unnecessarily hit this limit.)
        if (System.currentTimeMillis() - start >= 60000L) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Time limit exceeded for getJSLinks");
          }
          break;
        }
        String url = matcher.getMatch().group(1);
        // See if candidate URL is parseable. If not, pass and move on to
        // the next match.
        try {
          url = new URL(toAbsolutePath(base, url)).toString();
          LOG.info("Extension added: " + url + " and baseURL " + baseURL);
        } catch (MalformedURLException ex) {
          if (LOG.isTraceEnabled()) {
            LOG.trace("Extension - failed URL parse '" + url + "' and baseURL '"
              + baseURL + "'", ex);
          }
          continue;
        } catch (Exception ex) {
          // Any other failure while resolving this one link must not abort
          // the extraction of the links that follow it.
          LOG.warn("Extension - failed to resolve '" + url + "' and baseURL '"
              + baseURL + "', skipping.", ex);
          continue;
        }
        try {
          outlinks.add(new Outlink(url, anchor));
        } catch (MalformedURLException mue) {
          LOG.warn("Extension Invalid url: '" + url + "', skipping.");
        }
      }
    } catch (Exception ex) {
      if (LOG.isErrorEnabled()) {
        LOG.error("getJSLinks", ex);
      }
    }

    return outlinks.toArray(new Outlink[outlinks.size()]);
  }

  /**
   * Resolves a link found in the script against the directory URL of the
   * script.
   * <ul>
   * <li>A link matching {@code ABSOLUTE_URL_PATTERN} is returned unchanged.</li>
   * <li>A root-relative link ({@code /a/b.htm}) is appended to the
   * {@code scheme://host} part of {@code baseUrl}, when it has one.</li>
   * <li>Otherwise each leading {@code ../} removes one trailing path segment
   * from {@code baseUrl}, each leading {@code ./} is dropped, and the rest is
   * appended after a {@code /}.</li>
   * </ul>
   * Surplus {@code ../} segments are ignored once the host is reached, so the
   * result never climbs into {@code scheme://} and no index error is raised.
   *
   * @param baseUrl directory URL without a trailing slash
   * @param path    link text as written in the script
   * @return the resolved link
   */
  private static String toAbsolutePath(String baseUrl, String path)
      throws MalformedPatternException {
    final PatternMatcher matcher = new Perl5Matcher();
    if (matcher.matches(new PatternMatcherInput(path), ABSOLUTE_URL_PATTERN)) {
      return path;
    }

    // Index of the first character after "scheme://"; nothing before it may
    // be stripped while climbing.
    final int schemeSep = baseUrl.indexOf("://");
    final int hostStart = schemeSep >= 0 ? schemeSep + 3 : 0;

    // Root-relative link: resolve against scheme://host only.
    if (schemeSep >= 0 && path.startsWith("/") && !path.startsWith("//")) {
      final int hostEnd = baseUrl.indexOf('/', hostStart);
      final String root = hostEnd >= 0 ? baseUrl.substring(0, hostEnd) : baseUrl;
      return root + path;
    }

    while (true) {
      if (path.startsWith("../")) {
        final int lastSlash = baseUrl.lastIndexOf('/');
        if (lastSlash >= hostStart) {
          baseUrl = baseUrl.substring(0, lastSlash);
        }
        path = path.substring(3);
      } else if (path.startsWith("./")) {
        path = path.substring(2);
      } else {
        break;
      }
    }
    // path is now of the form foldera/fileb, without a leading "../" or "./"
    return baseUrl + "/" + path;
  }
}
Configuration
Then we need to include parse-tree-nodes-js in nutch-site.xml

<property>
  <name>plugin.includes</name>
  <value>protocol-http|urlfilter-regex|parse-tree-nodes-js|parse-(html|tika|metatags)|index-(basic|static|metadata|anchor)|urlnormalizer-(pass|regex|basic)|scoring-opic|subcollection</value>
</property>

Then change parse-plugins.xml to make Nutch use the parse-tree-nodes-js plugin to parse JavaScript files.

 


 


 

Then we need to change regex-urlfilter.txt so that Nutch handles JavaScript files: remove |js|JS from the following line.
#-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$
Finally, as we don't need to store the content of JavaScript files in Solr, we can either write a Solr UpdateRequestProcessor that ignores a document if the value of its url field ends with .js, or change org.apache.nutch.indexer.solr.SolrWriter.write(NutchDocument) as shown below:
// Excerpt of SolrWriter.write: returns early for documents whose "url" field
// ends with ".js"; the rest of the original method body is elided ("...").
public void write(NutchDocument doc) throws IOException {
    String urlValue = doc.getFieldValue("url");
    // JavaScript files are crawled only for their outlinks, so the document
    // is dropped here before the remainder of write() runs.
    if(urlValue!=null && urlValue.endsWith(".js"))
    {
      LOG.trace("Extension ignore js file: " + urlValue);
      return;
    }
...
}
References
http://wiki.apache.org/nutch/AboutPlugins
http://wiki.apache.org/nutch/WritingPluginExample
http://florianhartl.com/nutch-plugin-tutorial.html

Labels

adsense (5) Algorithm (69) Algorithm Series (35) Android (7) ANT (6) bat (8) Big Data (7) Blogger (14) Bugs (6) Cache (5) Chrome (19) Code Example (29) Code Quality (7) Coding Skills (5) Database (7) Debug (16) Design (5) Dev Tips (63) Eclipse (32) Git (5) Google (33) Guava (7) How to (9) Http Client (8) IDE (7) Interview (88) J2EE (13) J2SE (49) Java (186) JavaScript (27) JSON (7) Learning code (9) Lesson Learned (6) Linux (26) Lucene-Solr (112) Mac (10) Maven (8) Network (9) Nutch2 (18) Performance (9) PowerShell (11) Problem Solving (11) Programmer Skills (6) regex (5) Scala (6) Security (9) Soft Skills (38) Spring (22) System Design (11) Testing (7) Text Mining (14) Tips (17) Tools (24) Troubleshooting (29) UIMA (9) Web Development (19) Windows (21) xml (5)