HTML-Miner

NAME 

	HTML::Miner - This Module 'Mines' (hopefully) useful information from a URL or HTML snippet.

VERSION 

	Version 0.05

SYNOPSIS 

	HTML::Miner 'Mines' (hopefully) useful information from a URL or HTML snippet. The following is a list of HTML elements that can be extracted:

	Find all links and for each link extract:

	     URL Title
	     URL href
	     URL Anchor Text
	     URL Domain
	     URL Protocol
	     URL URI
	     URL Absolute location

	Find all images and for each image extract:

	     IMG Source URL
	     IMG Absolute Source URL
	     IMG Source Domain

        Extracts Meta Elements such as

	     Page Title
	     Page Description
	     Page Keywords
	     Page RSS Feeds

        Finds the final destination URL of a potentially redirecting URL.

        Find all JS and CSS files used within the HTML and find their absolute URLs if required.



	Example ( Object Oriented Usage )

	     use HTML::Miner;

	     my $html = "some html";
    	     # or $html = do{local $/;<DATA>}; with __DATA__ provided

    	     my $html_miner = HTML::Miner->new ( 
      	     	CURRENT_URL                   => 'www.perl.org'   , 
      		CURRENT_URL_HTML              => $html 
    	     );


    	     my $meta_data =  $html_miner->get_meta_elements() ;
	     my $links     = $html_miner->get_links()          ;
    	     my $images    = $html_miner->get_images()         ;

    	     my ( $clear_url, $protocol, $domain, $uri ) = $html_miner->break_url();  

	     my $css_and_js =  $html_miner->get_page_css_and_js()  ;

    	     my $out = HTML::Miner::get_redirect_destination( "redirectingurl_here.html" ) ;

    	     my $out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "../../about/" );


	Example ( Direct access of Methods )

             use HTML::Miner;

    	     my $html = "some html";
    	     # or $html = do{local $/;<DATA>}; with __DATA__ provided

    	     my $url = "http://www.perl.org";

    	     my $meta_data  =  HTML::Miner::get_meta_elements( $url, $html ) ;
    	     my $links      = HTML::Miner::get_links( $url, $html )          ;
    	     my $images     = HTML::Miner::get_images( $url, $html )         ;

    	     my ( $clear_url, $protocol, $domain, $uri ) = HTML::Miner::break_url( $url );  

	     my $css_and_js = HTML::Miner::get_page_css_and_js( 
             	URL                       =>    $url                     , 
		HTML                      =>    $optionally_html_of_url  ,   
           	CONVERT_URLS_TO_ABS       =>    1                        ,  # Optional argument ( 0 or 1 ), default is 1
             );	     

    	     my $out = HTML::Miner::get_redirect_destination( "redirectingurl_here.html" ) ;

    	     my $out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "../../about/" );


	Testing HTML

		    __DATA__

		      <html>
		         <head>
			     <title>SiteTitle</title>
			     <meta name="description" content="desc of site" />
			     <meta name="keywords"    content="kw1, kw2, kw3" />
          		     <link rel="alternate" type="application/atom+xml" title="Title" href="http://www.my_domain_to_mine.com/feed/atom/" />
          		     <link rel="alternate" type="application/rss+xml" title="Title" href="http://www.othersite.com/feed/" />
          		     <link rel="alternate" type="application/rdf+xml" title="Title" href="my_domain_to_mine.com/feed/" /> 
          		     <link rel="alternate" type="text/xml" title="Title" href="http://www.other.org/feed/rss/" />
			     <script type="text/javascript" src="http://static.myjsdomain.com/frameworks/barlesque.js"></script>
          		     <script type="text/javascript" src="http://js.revsci.net/gateway/gw.js?csid=J08781"></script>
          		     <script type="text/javascript" src="/about/other.js"></script>
          		     <link rel="stylesheet" type="text/css" href="http://static.mycssdomain.com/frameworks/style/main.css" />
      			 </head>
      			 <body>
      
		             <a href="http://linkone.com">Link1</a>
      			     <a href="link2.html" TITLE="title2" >Link2</a>
      			     <a href="/link3">Link3</a>
      
      
			     <img src="http://my_domain_to_mine.com/logo_plain.jpg" >
      			     <img alt="image2" src="http://my_domain_to_mine.com/image2.jpg" />
      			     <img src="http://my_other.com/image3.jpg" alt="link3">
      			     <img src="image3.jpg" alt="link3">
      
      
			</body>
      		     </html>


	Example Outputs

    		my $meta_data =  $html_miner->get_meta_elements() ;

		# $meta_data->{ TITLE }             =>   "SiteTitle"
    		# $meta_data->{ DESC }              =>   "desc of site"
    		# $meta_data->{ KEYWORDS }->[0]     =>   "kw1"
    		# $meta_data->{ RSS }->[0]->{TYPE}  =>   "application/atom+xml"



    		my $links = $html_miner->get_links();

    		# $links->[0]->{ DOMAIN }         =>   "linkone.com"
    		# $links->[0]->{ ANCHOR }         =>   "Link1"
    		# $links->[2]->{ ABS_URL   }      =>   "http://my_domain_to_mine.com/link3"
    		# $links->[1]->{ DOMAIN_IS_BASE } =>   1
    		# $links->[1]->{ TITLE }          =>   "title2"



    		my $images = $html_miner->get_images();

    		# $images->[0]->{ IMG_LOC }     =>  "http://my_domain_to_mine.com/logo_plain.jpg"
    		# $images->[2]->{ ALT }         =>  "link3"
    		# $images->[0]->{ IMG_DOMAIN }  =>  "my_domain_to_mine.com"
    		# $images->[3]->{ ABS_LOC }     =>  "http://my_domain_to_mine.com/image3.jpg"




		my $css_and_js =  $html_miner->get_page_css_and_js(
         	   CONVERT_URLS_TO_ABS       =>    0
    		);

		# $css_and_js will contain:
    		#    {
    		#      CSS => [
    		#         "http://static.mycssdomain.com/frameworks/style/main.css",
    		#         "/rel_cssfile.css",
    		#        ],
    		#      JS  => [
    		#          "http://static.myjsdomain.com/frameworks/barlesque.js",
    		#          "http://js.revsci.net/gateway/gw.js?csid=J08781",
    		#          "/about/rel_jsfile.js",
    		#        ],
    		#    }

    		my $css_and_js =  $html_miner->get_page_css_and_js(
         	   CONVERT_URLS_TO_ABS       =>    1
    		);

		# $css_and_js will contain:
    		#    {
    		#      CSS => [
    		#         "http://static.mycssdomain.com/frameworks/style/main.css",
    		#         "http://www.perl.org/rel_cssfile.css",
   		#        ],
    		#      JS  => [
    		#          "http://static.myjsdomain.com/frameworks/barlesque.js",
    		#          "http://js.revsci.net/gateway/gw.js?csid=J08781",
    		#          "http://www.perl.org/about/rel_jsfile.js",
    		#        ],
    		#    }



    		my ( $clear_url, $protocol, $domain, $uri ) = $html_miner->break_url();  

    		# $clear_url   =>  "http://my_domain_to_mine.com/my_page_to_mine.pl"
    		# $protocol    =>  "http"
    		# $domain      =>  "my_domain_to_mine.com"
    		# $uri         =>  "/my_page_to_mine.pl"



    		HTML::Miner::get_redirect_destination( "redirectingurl_here.html" ) => 'redirected_to'

    		my $out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "../../about/" );
    		# $out    => "http://www.perl.com/about/"

    		$out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/index.html", "index2.html" );
    		# $out    => "http://www.perl.com/help/faq/index2.html"

    		$out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "../../index.html" );
    		# $out    => "http://www.perl.com/index.html"

    		$out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "/about/" );
    		# $out    => "http://www.perl.com/about/"

    		$out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "http://othersite.com" );
    		# $out    => "http://othersite.com/"



EXPORT 

This module does not export anything through @EXPORT; however, it does export the following functions on request through @EXPORT_OK:

     get_links
     get_absolute_url
     break_url
     get_redirect_destination
     get_images
     get_meta_elements
     get_page_css_and_js


INSTALLATION

To install this module, run the following commands:

	perl Makefile.PL
	make
	make test
	make install

SUPPORT AND DOCUMENTATION

After installing, you can find documentation for this module with the
perldoc command.

    perldoc HTML::Miner

You can also look for information at:

    RT, CPAN's request tracker
        http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-Miner

    AnnoCPAN, Annotated CPAN documentation
        http://annocpan.org/dist/HTML-Miner

    CPAN Ratings
        http://cpanratings.perl.org/d/HTML-Miner

    Search CPAN
        http://search.cpan.org/dist/HTML-Miner/


COPYRIGHT AND LICENCE

Copyright (C) 2009 4am Design and Technology Labs Pvt. Ltd., all rights reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.