Search notes:

Perl module HTML::LinkExtractor

#!/usr/bin/perl
use warnings;
use strict;

use HTML::LinkExtractor;

# Base URL handed to the extractor; relative links in the document
# (e.g. ../relative.html, picture.gif) are reported resolved against it.
my $base_url = 'http://server.xyz/path/to/baseurl.html';

# Sample document to scan. The heredoc terminator is unquoted, so the text
# is interpolated like a double-quoted string; it contains no `$` or `@`,
# hence nothing is substituted.
my $html = <<HTML;
<!doctype html>
<head>
  <title>Simple HTML page</title>
</head>
<body>

  <h1>Heading</h1>

  <p>Some text with a <a href="http://renenyffenegger.ch/">link to my homepage</a>.</p>

  There are two more links: <ul>
    <li>A <a href="../relative.html">relative link</a> and
    <li>one to <a href="file:///home/rene/test/foo.html">a local page</a>.
  </ul>

  <p><img src='picture.gif'>

</body>
</html>
HTML

# Create the extractor with link_found() as callback and $base_url as the
# base for relative links, then parse the document (passed by reference).
my $extractor = HTML::LinkExtractor->new(\&link_found, $base_url);
$extractor -> parse(\$html);

# Callback for HTML::LinkExtractor: prints one line per reported link.
#
# Arguments:
#   1. the extractor object (ignored here)
#   2. a hash reference describing the link; this sub reads the keys
#      `tag`, `href`, `_TEXT` (for <a>) and `src` (for <img>).
#
# For <a> the href and the link text are printed, for <img> only the src.
# Any other tag is reported as unhandled and nothing else is printed.
sub link_found {
  my (undef, $found) = @_;  # first argument (the extractor) is not needed

  my $tag       = $found->{tag};
  my $is_anchor = $tag eq 'a';

  # Guard clause: only <a> and <img> are handled.
  if (!$is_anchor && $tag ne 'img') {
    print "unhandled tag $tag\n";
    return;
  }

  # <a> carries its target in href and has link text; <img> uses src
  # and is printed with an empty text column.
  my $target = $is_anchor ? $found->{href}  : $found->{src};
  my $label  = $is_anchor ? $found->{_TEXT} : '';

  printf "Found link: %-50s %s\n", $target, $label;
}
GitHub repository PerlModules, path: /HTML/LinkExtractor/script.pl
This script prints:
Found link: http://renenyffenegger.ch/                         <a href="http://renenyffenegger.ch/">link to my homepage</a>
Found link: http://server.xyz/path/relative.html               <a href="../relative.html">relative link</a>
Found link: file:///home/rene/test/foo.html                    <a href="file:///home/rene/test/foo.html">a local page</a>
Found link: http://server.xyz/path/to/picture.gif              

See also

download-content-of-linked-pages.pl
Perl modules

Index

Fatal error: Uncaught PDOException: SQLSTATE[HY000]: General error: 8 attempt to write a readonly database in /home/httpd/vhosts/renenyffenegger.ch/php/web-request-database.php:78 Stack trace: #0 /home/httpd/vhosts/renenyffenegger.ch/php/web-request-database.php(78): PDOStatement->execute(Array) #1 /home/httpd/vhosts/renenyffenegger.ch/php/web-request-database.php(30): insert_webrequest_('/notes/developm...', 1741108574, '18.119.113.14', 'Mozilla/5.0 App...', NULL) #2 /home/httpd/vhosts/renenyffenegger.ch/httpsdocs/notes/development/languages/Perl/modules/HTML/LinkExtractor/index(103): insert_webrequest() #3 {main} thrown in /home/httpd/vhosts/renenyffenegger.ch/php/web-request-database.php on line 78