HTML-Miner NAME HTML::Miner - This Module 'Mines' (hopefully) useful information from a URL or HTML snippet. VERSION Version 0.05 SYNOPSIS HTML::Miner 'Mines' (hopefully) useful information from a URL or HTML snippet. The following is a list of HTML elements that can be extracted: Find all links and for each link extract: URL Title URL href URL Anchor Text URL Domain URL Protocol URL URI URL Absolute location Find all images and for each image extract: IMG Source URL IMG Absolute Source URL IMG Source Domain Extracts Meta Elements such as Page Title Page Description Page Keywords Page RSS Feeds Finds the final destination URL of a potentially redirecting URL. Find all JS and CSS files used within the HTML and find their absolute URL if required. Example ( Object Oriented Usage ) use HTML::Miner; my $html = "some html"; # or $html = do{local $/;<DATA>}; with __DATA__ provided my $html_miner = HTML::Miner->new ( CURRENT_URL => 'www.perl.org' , CURRENT_URL_HTML => $html ); my $meta_data = $html_miner->get_meta_elements() ; my $links = $html_miner->get_links() ; my $images = $html_miner->get_images() ; my ( $clear_url, $protocol, $domain, $uri ) = $html_miner->break_url(); my $css_and_js = $html_miner->get_page_css_and_js() ; my $out = HTML::Miner::get_redirect_destination( "redirectingurl_here.html" ) ; $out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "../../about/" ); Example ( Direct access of Methods ) use HTML::Miner; my $html = "some html"; # or $html = do{local $/;<DATA>}; with __DATA__ provided my $url = "http://www.perl.org"; my $meta_data = HTML::Miner::get_meta_elements( $url, $html ) ; my $links = HTML::Miner::get_links( $url, $html ) ; my $images = HTML::Miner::get_images( $url, $html ) ; my ( $clear_url, $protocol, $domain, $uri ) = HTML::Miner::break_url( $url ); my $css_and_js = HTML::Miner::get_page_css_and_js( URL => $url , HTML => $optionally_html_of_url , CONVERT_URLS_TO_ABS => 0/1 , [ Optional argument, default is 1 ] ); my $out = 
HTML::Miner::get_redirect_destination( "redirectingurl_here.html" ) ; $out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "../../about/" ); Testing HTML __DATA__ <html> <head> <title>SiteTitle</title> <meta name="description" content="desc of site" /> <meta name="keywords" content="kw1, kw2, kw3" /> <link rel="alternate" type="application/atom+xml" title="Title" href="http://www.my_domain_to_mine.com/feed/atom/" /> <link rel="alternate" type="application/rss+xml" title="Title" href="http://www.othersite.com/feed/" /> <link rel="alternate" type="application/rdf+xml" title="Title" href="my_domain_to_mine.com/feed/" /> <link rel="alternate" type="text/xml" title="Title" href="http://www.other.org/feed/rss/" /> <script type="text/javascript" src="http://static.myjsdomain.com/frameworks/barlesque.js"></script> <script type="text/javascript" src="http://js.revsci.net/gateway/gw.js?csid=J08781"></script> <script type="text/javascript" src="/about/other.js"></script> <link rel="stylesheet" type="text/css" href="http://static.mycssdomain.com/frameworks/style/main.css" /> </head> <body> <a href="http://linkone.com">Link1</a> <a href="link2.html" TITLE="title2" >Link2</a> <a href="/link3">Link3</a> <img src="http://my_domain_to_mine.com/logo_plain.jpg" > <img alt="image2" src="http://my_domain_to_mine.com/image2.jpg" /> <img src="http://my_other.com/image3.jpg" alt="link3"> <img src="image3.jpg" alt="link3"> </body> </html> Example Outputs my $meta_data = $html_miner->get_meta_elements() ; # $meta_data->{ TITLE } => "SiteTitle" # $meta_data->{ DESC } => "desc of site" # $meta_data->{ KEYWORDS }->[0] => "kw1" # $meta_data->{ RSS }->[0]->{TYPE} => "application/atom+xml" my $links = $html_miner->get_links(); # $links->[0]->{ DOMAIN } => "linkone.com" # $links->[0]->{ ANCHOR } => "Link1" # $links->[2]->{ ABS_URL } => "http://my_domain_to_mine.com/link3" # $links->[1]->{ DOMAIN_IS_BASE } => 1 # $links->[1]->{ TITLE } => "title2" my $images = 
$html_miner->get_images(); # $images->[0]->{ IMG_LOC } => "http://my_domain_to_mine.com/logo_plain.jpg" # $images->[2]->{ ALT } => "link3" # $images->[0]->{ IMG_DOMAIN } => "my_domain_to_mine.com" # $images->[3]->{ ABS_LOC } => "http://my_domain_to_mine.com/image3.jpg" my $css_and_js = $html_miner->get_page_css_and_js( CONVERT_URLS_TO_ABS => 0 ); # $css_and_js will contain: # { # CSS => [ # "http://static.mycssdomain.com/frameworks/style/main.css", # "/rel_cssfile.css", # ], # JS => [ # "http://static.myjsdomain.com/frameworks/barlesque.js", # "http://js.revsci.net/gateway/gw.js?csid=J08781", # "/about/rel_jsfile.js", # ], # } my $css_and_js = $html_miner->get_page_css_and_js( CONVERT_URLS_TO_ABS => 1 ); # $css_and_js will contain: # { # CSS => [ # "http://static.mycssdomain.com/frameworks/style/main.css", # "http://www.perl.org/rel_cssfile.css", # ], # JS => [ # "http://static.myjsdomain.com/frameworks/barlesque.js", # "http://js.revsci.net/gateway/gw.js?csid=J08781", # "http://www.perl.org/about/rel_jsfile.js", # ], # } my ( $clear_url, $protocol, $domain, $uri ) = $html_miner->break_url(); # $clear_url => "http://my_domain_to_mine.com/my_page_to_mine.pl" # $protocol => "http" # $domain => "my_domain_to_mine.com" # $uri => "/my_page_to_mine.pl" HTML::Miner::get_redirect_destination( "redirectingurl_here.html" ) => 'redirected_to' my $out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "../../about/" ); # $out => "http://www.perl.com/about/" $out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/index.html", "index2.html" ); # $out => "http://www.perl.com/help/faq/index2.html" $out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "../../index.html" ); # $out => "http://www.perl.com/index.html" $out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "/about/" ); # $out => "http://www.perl.com/about/" $out = HTML::Miner::get_absolute_url( "www.perl.com/help/faq/", "http://othersite.com" ); # $out => "http://othersite.com/" 
EXPORT This Module does not export anything through @EXPORT; however, it does export the following functions through @EXPORT_OK: get_links get_absolute_url break_url get_redirect_destination get_images get_meta_elements get_page_css_and_js INSTALLATION To install this module, run the following commands: perl Makefile.PL make make test make install SUPPORT AND DOCUMENTATION After installing, you can find documentation for this module with the perldoc command. perldoc HTML::Miner You can also look for information at: RT, CPAN's request tracker http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-Miner AnnoCPAN, Annotated CPAN documentation http://annocpan.org/dist/HTML-Miner CPAN Ratings http://cpanratings.perl.org/d/HTML-Miner Search CPAN http://search.cpan.org/dist/HTML-Miner/ COPYRIGHT AND LICENCE Copyright (C) 2009 4am Design and Technology Labs Pvt. Ltd., all rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.