#!/usr/bin/env perl6
use v6;

use PDF::Class;
use PDF::Catalog;
use PDF::StructTreeRoot;
use PDF::Tags::Reader;
use PDF::Tags::XML-Writer;
use PDF::Tags::Node :TagName;
use PDF::IO;

# Strictly positive integer; used below to constrain the --max-depth option.
subset Number of Int where { $_ > 0 };

# Command-line entry point: dump the structure tree of a tagged PDF as XML
# on standard output.
#
# The input PDF is opened from $infile (or slurped from standard input when
# $infile is '-'), its tags are read, and either the root node or the nodes
# matched by --select are handed to the XML writer. When --root-tag is given
# and the first selected node is not a defined PDF::Tags object, the output
# is wrapped in an element with that name.
#
# The trailing `#=` comments on the parameters are declarator docs; Raku
# uses them for the generated usage message, so they are user-facing text.
sub MAIN(Str $infile,               #= input PDF
         Str     :$password = '',   #= password for the input PDF, if encrypted
         Number  :$max-depth = 16,  #= depth to ascend/descend struct tree
         Bool    :$atts = True,     #= include attributes in tags
         Bool    :$debug,           #= write extra debugging information
         Bool    :$marks,           #= descend into marked content
         Bool    :$strict = True,   #= warn about unknown tags, etc
         Bool    :$style = True,    #= include stylesheet header
         Str     :$select,          #= XPath of twigs to include (relative to root)
         TagName :$omit,            #= Tags to omit from output
         TagName :$root-tag,        #= Outer root tag name
        ) {

    # Standard input cannot be re-read, so it is slurped into memory as
    # binary; a named file is passed as an IO::Path instead.
    # `.slurp` replaces `.slurp-rest`, which is deprecated since Raku 6.d.
    my PDF::IO $input .= coerce(
       $infile eq '-'
           ?? $*IN.slurp( :bin )      # sequential access
           !! $infile.IO              # random access
    );

    my PDF::Class $pdf .= open( $input, :$password );
    my PDF::Tags::Reader $dom .= read: :$pdf, :$strict, :$marks;
    my PDF::Tags::XML-Writer $xml .= new: :$max-depth, :$atts, :$debug, :$omit, :$style, :$root-tag, :$marks;

    # Nodes to serialize: the --select matches if given, otherwise the root.
    my PDF::Tags::Node @nodes = do with $select {
        $dom.find($_);
    }
    else {
        $dom.root;
    }

    # Becomes 1 when this script emits the outer root tag itself; it is then
    # passed to the writer as the starting depth of each node.
    my UInt $depth = 0;

    with $root-tag {
        # Only wrap here when the first node is not a defined PDF::Tags
        # object (presumably the writer emits the root tag itself in that
        # case -- confirm against PDF::Tags::XML-Writer).
        unless @nodes[0] ~~ PDF::Tags:D {
            say '<' ~ $_ ~ '>';
            $depth++;
        }
    }

    $xml.say($*OUT, $_, :$depth) for @nodes;

    # Close the wrapper element opened above, if any.
    say '</' ~ $root-tag ~ '>' if $depth;
}



head

Options:
   --password          password for an encrypted PDF
   --max-depth=n       maximum tag-depth to descend
   --select=XPath      nodes to be included
   --omit=tag-name     nodes to be excluded
   --root-tag=tag-name define outer root tag
   --marks             descend into marked content
   --debug             add debugging to output
   --/atts             omit attributes in tags
   --/strict           suppress warnings
   --/style            omit root stylesheet link


Dumps structure elements from a tagged PDF.


Produces tagged output in an XML format.


Only some PDF files are tagged. pdf-info.raku can be
used to check this:


% pdf-info.raku my-doc.pdf | grep Tagged:
Tagged:     yes


This script requires the freetype6 native library and the PDF::Font::Loader
Raku module to be installed on your system.


item

root

index

README

React

=begin React :component<HeaderCol> :id<menu> 
=para L<Documentation|/doc/introduction>
=para L<Modules| /mods>
=para L<Examples| file:../examples/index.podlite>
=para L<Download |file:../download/index.podlite>
=para L<About|file:../about/index.podlite>
=para 🔍 K<⌘K>/K<ctrl-K>
=end React


=begin React :component<HeaderCol> :id<footer>  
=begin nested :!nested
=item1 B<Language>
    =item2 L<Get started|file:../getting-started/getting-started.podlite>
    =item2 L<Why Raku?|/doc/language/faq#Why-should-I-learn-Raku-What's-so-great-about-it>
    =item2 L<Try Raku|https://glot.io/new/raku>
    =item2 L<Raku cheat sheet|https://github.com/Raku/mu/blob/master/docs/Perl6/Cheatsheet/cheatsheet.txt>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<Raku on Exercism|https://exercism.org/tracks/raku>
    =item2 L<Wikipedia|https://en.wikipedia.org/wiki/Raku_(programming_language)>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>

=end nested
=begin nested :!nested

=item1 B<L<Documentation|/doc>>
    =item2 L<Getting started, Migration guides from other languages, & Tutorials | file:../doc/introduction.podlite>
    =item2 L<Language References|file:../doc/reference.podlite>
    =item2 L<Type Reference| file:../doc/types.podlite>
    =item2 L<Miscellaneous| file:../doc/miscellaneous.podlite>
    =item2 L<FAQs (Frequently Asked Questions)|/doc/language/faq>
    =item2 L<Community|/doc/language/community>
    =item2 L<The list of all documents|file:../doc/index.podlite>

=item1 B<Resources>
    =item2 L<The Raku Guide|https://raku.guide/>
    =item2 L<Books |https://perl6book.com/>
    =item2 L<Rosetta Code | https://www.raku.org/community/rosettacode>

=end nested
=begin nested :!nested

=item1 B<Resources>
    =item2 L<Download | file:../download/index.podlite>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>

=item1 B<Learning>
    =item2 L<The Raku Guide|https://raku.guide/>
    =item2 L<Wikibook |https://en.wikibooks.org/wiki/Raku_Programming>
    =item2 L<Books |https://perl6book.com/>
    =item2 L<Rosetta Code | https://www.raku.org/community/rosettacode>
    =item2 L<Learn Raku in Y minutes|https://learnxinyminutes.com/docs/raku/>
    =item2 L<Golfing |https://github.com/AlexDaniel/raku-golf-cheatsheet>
=end nested
=begin nested :!nested

=item1 B<Explore>
    =item2 L<Raku Blog Aggregator|https://planet.raku.org/>
    =item2 L<Rakudo Weekly|https://rakudoweekly.blog/>
    =item2 L<The Weekly Challenge |https://perlweeklychallenge.org/>
    =item2 L<Raku Advent Calendar|https://raku-advent.blog/>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>

=end nested
=end React


=begin React :component<CookieConsent> :id<CookieConsent> :buttonCaption("Got it!")
=para
This website uses cookies for analytics.
=end React


pdf-tag-dump

SYNOPSIS

DESCRIPTION

DEPENDENCIES

BUGS AND LIMITATIONS

TODO

PDF::Tags::Reader v0.0.4

Authors

License

Dependencies

Test Dependencies

Provides

Documentation