Menu

Sunday, June 5, 2011

Parse an HTML file using HTMLParser

The HTMLParser API is a powerful HTML parsing library. Download the HTML Parser from the site below.

Autocad Designer RAMTESA United Arab Emirates 31-May-2011
OIL AND GAS DOWNSTREAM SALES ENGINE RAMTESA United Arab Emirates 31-May-2011


Here is the Java class that parses the HTML above.

/**
 * Example program that reads a local HTML file with the HTMLParser library
 * and prints the text of the table cells of every {@code tr} element that
 * carries a {@code class} attribute.
 *
 * <p>Each matching row is written to standard output as one line, with the
 * cell texts separated by {@code " : "}.
 */
public class HtmlParserMain {
    /** Maximum number of cells printed for a single table row. */
    private static final int MAX_COLUMNS = 4;

    /**
     * Parses {@code C:/Akila/test.html} and prints the first
     * {@value #MAX_COLUMNS} {@code td} cells of every matching row.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the file cannot be opened or read, or if parsing fails
     */
    public static void main (String[] args) throws Exception{
        FileInputStream fis =
                new FileInputStream (new File("C:/Akila/test.html"));
        try {
            //Create parser
            Parser parser = new Parser(new Lexer(new Page(fis, "UTF-8")));

            //Get specific node: only the tr nodes that have a class attribute
            NodeFilter filter =
                        new AndFilter(
                            new TagNameFilter("tr"),
                            new HasAttributeFilter("class"));

            //Parse the html
            NodeList list = parser.parse(filter);

            //Get the iterator
            SimpleNodeIterator iterator = list.elements();

            //Iterate table rows
            while (iterator.hasMoreNodes()) {
                //Get the TR node
                TagNode node = (TagNode)iterator.nextNode();
                NodeList tdList = new NodeList ();
                //Get the TD nodes
                node.collectInto(tdList, new TagNameFilter("td"));

                //A row may have fewer cells than expected (for example a
                //header row built from th elements), so never index past
                //the cells that were actually collected.
                int columns = Math.min(tdList.size(), MAX_COLUMNS);
                if (columns == 0) {
                    continue;
                }

                StringBuilder line = new StringBuilder();
                for (int i = 0; i < columns; i++) {
                    if (i > 0) {
                        line.append(" : ");
                    }
                    line.append(tdList.elementAt(i).toPlainTextString());
                }
                System.out.println(line);
            }
        } finally {
            //Always release the file handle, even when parsing fails
            fis.close();
        }

        //To parse the html directly from the web, the parser can be
        //created from a URL string instead, e.g. new Parser("http://www.google.com")
    }
}