head

code

item

root

para

L<Examples| file:../examples/index.podlite>


L<Download |file:../download/index.podlite>


React

=begin React :component<HeaderCol> :id<menu> 
=para L<Documentation|/doc/introduction>
=para L<Modules| /mods>
=para L<Examples| file:../examples/index.podlite>
=para L<Download |file:../download/index.podlite>
=para L<About|file:../about/index.podlite>
=para 🔍 K<⌘K>/K<ctrl-K>
=end React


HEADER

FOOTER

index

use PDF::Tags;

unit class PDF::Tags::Reader:ver<0.0.4>
    is PDF::Tags;

use PDF::Font::Loader::FontObj;
use PDF::Font::Loader;
use PDF::Content::Canvas;
use PDF::Content::Font;
use PDF::Content::FontObj;
use PDF::Content::Ops :GraphicsContext;
use PDF::Content::Matrix :&is-identity;
use PDF::Content::Tag :InlineElemTags;
use PDF::Class;

has Bool $.strict = True;
has Bool $.marks;

# Construct a reader over the structure tree of an already tagged PDF.
#
#   :$pdf     the opened PDF document (required)
#   :$create  when true, a missing struct-tree root is created via
#             self.create instead of being treated as an error
#   |c        any further arguments are forwarded to the constructor
#
# Fails with "PDF document does not contain marked content" when the
# catalog has no StructTreeRoot and :create was not requested.
method read(PDF::Class:D :$pdf!, Bool :$create, |c --> PDF::Tags:D) {
    my $cos = $pdf.catalog.StructTreeRoot;

    # usual case: the document already has a struct-tree root
    return self.new(:$cos, :root(self.WHAT), |c)
        if $cos.defined;

    # no root present; build one only when asked to
    return self.create(:$pdf, |c)
        if $create;

    fail "PDF document does not contain marked content";
}

# Graphics callback helper. An instance supplies a callback (see the
# `callback` method) that is invoked per content-stream operator; it tracks
# the current font and marked-content state and appends decoded text to the
# children of the tag the text belongs to.
class TextDecoder {
    use PDF::Content::Ops :OpCode;
    use Method::Also;
    # stack of { font, font-size } hashes pushed by Save and popped by Restore
    has Hash @!save;
    # font dictionary most recently selected by SetFont / SetGraphicsState
    has PDF::Content::Font $!font;
    # NOTE: the explicit `current-font` method below takes the place of the
    # accessor this attribute would otherwise generate
    has $.current-font;
    has Numeric $.font-size = 10;
    # most recently opened tag that has an MCID; reset when that tag closes
    has PDF::Content::Tag $.mark;
    # count of currently open Artifact tags; text is ignored while non-zero
    has Int $!artifact;
    # value last computed by !set-ty; compared across TextMove calls
    has Numeric $!ty;

    # Returns the font object of the current font dictionary, calling
    # PDF::Font::Loader.load-font first when the dictionary does not yet
    # hold a PDF::Content::FontObj instance.
    # NOTE(review): presumably load-font installs the font object on the
    # dictionary as a side effect - confirm against PDF::Font::Loader.
    method current-font {
        PDF::Font::Loader.load-font: :dict($!font)
            unless $!font.font-obj ~~ PDF::Content::FontObj:D;
        $!font.font-obj;
    }

    # Returns a sub taking an operator and its arguments. The operator is
    # mapped to its OpCode key name and, when this class has a method of
    # that name, the method is called with the arguments; other operators
    # are ignored.
    method callback {
        sub ($op, *@args) {
            my $method = OpCode($op).key;
            self."$method"(|@args)
                if self.can($method);
        }
    }
    # Inspects the last entry of the open tags: bumps the artifact count for
    # an Artifact tag and remembers the tag as the current mark when it has
    # an MCID.
    method BeginMarkedContent($,$?) is also<BeginMarkedContentDict> {
        given $*gfx.tags.open-tags.tail -> $tag {
            $!artifact++ if $tag.name eq Artifact;
            $!mark = $tag if $tag.mcid;
        }
    }
    # Counterpart of BeginMarkedContent, applied to the tag just closed.
    method EndMarkedContent() {
        with $*gfx.tags.closed-tag -> $tag {
            $!artifact-- if $tag.name eq Artifact;
            with $!mark {
                # $_ is aliased to $!mark here, so this clears the mark
                # when the closed tag is the one being tracked
                $_ = Nil if $_ === $tag;
            }
        }
    }
    # Pushes the current font and font size onto the save stack.
    method Save()      {
        @!save.push: %( :$!font, :$!font-size );
    }
    # Restores font and font size from the save stack; does nothing when
    # the stack is empty.
    method Restore()   {
        if @!save {
            given @!save.pop {
                $!font = .<font>;
                $!font-size = .<font-size>;
            }
        }
    }
    # The second operand is bound straight into $!font-size; the font is
    # taken from the graphics object when it reports a defined font face.
    method SetFont($, $!font-size) {
        $!font = $_ with $*gfx.font-face;
    }
    # When `$gs<Font>` exists, re-reads both font and font size from the
    # graphics object.
    method SetGraphicsState($gs) {
        if $gs<Font>:exists {
            $!font = $*gfx.font-face;
            $!font-size = $*gfx.font-size;
        }
    }
    # Appends $text to the current mark or, failing that, to the last open
    # tag. Text is concatenated onto a trailing Str child when there is one,
    # otherwise pushed as a new child. With no tag at all, the text is
    # reported via `note`.
    method !save-text($text) {
        with $!mark // $*gfx.open-tags.tail -> $tag {
            given $tag.children {
                if .tail ~~ Str:D {
                    .tail ~= $text;
                }
                else {
                    $tag.children.push: $text;
                }
            }
        }
        else {
            note "untagged text: {$text}";
        }
    }
    # Records element [5] of the text matrix divided by element [3] and
    # returns the new value.
    # NOTE(review): presumably the vertical text position normalised by the
    # vertical scale - confirm against the TextMatrix layout.
    method !set-ty { $!ty = .[5] / .[3] given $*gfx.TextMatrix; }
    # Decodes the operand with the current font and saves it, unless inside
    # an artifact.
    method ShowText($_) {
        unless $!artifact {
            self!set-ty;
            my $text = $.current-font.decode($_, :str);
            self!save-text: $text;
        }
    }
    # Handles a list mixing strings and numeric adjustments: strings are
    # decoded, sufficiently negative numbers may become a space, and all
    # other elements are dropped. The joined result is saved as one text.
    method ShowSpaceText(List $_) {
        unless $!artifact {
            self!set-ty;
            # starts out as a space so that a leading adjustment does not
            # produce a space of its own
            my Str $last := ' ';
            my @chunks = .map: {
                when Str {
                    $last := $.current-font.decode($_, :str);
                }
                when $_ <= -120 && !($last ~~ /\s$/) {
                    # an adjustment of -120 or less is assumed to be an
                    # implicit space, unless the preceding decoded text
                    # already ends in whitespace
                    ' '
                }
                default { Empty }
            }

            self!save-text: @chunks.join;
        }
    }
    method TextNextLine(|) is also<TextMoveSet MoveShowText MoveSetShowText> {
        # treat these as explicit newlines
        unless $!artifact {
            self!save-text: "\n";
        }
    }
    # $x and $y are not used; the shift is measured from the text matrix.
    method TextMove($x, $y) {
        # treat a significant vertical shift from the
        # last text positioning as an explicit newline
        unless $!artifact {
            my $old-ty = $!ty;
            my $new-ty = self!set-ty;
            with $old-ty {
                # shift expressed as a fraction of the font size; anything
                # outside -0.3 .. 0.3 counts as a line break
                my $leading = ($_ - $new-ty) / $!font-size;
                self!save-text: "\n"
                    unless -.3 <= $leading <= .3;
            }
        }
    }
    # Not implemented: only warns (outside artifacts) that the operator
    # was encountered.
    # NOTE(review): presumably `Do` is the XObject invocation operator -
    # confirm.
    method Do($key) {
        warn "todo Do $key"
            unless $!artifact;
    }
}
constant Tags = Hash[PDF::Content::Tag];
has Tags %!canvas-tags{PDF::Content::Canvas};

# Returns the marked-content tags of a canvas object as a hash keyed by
# MCID. The first call for a given object renders it with a fresh
# TextDecoder callback and caches the result; later calls return the
# cached hash. A '.' is printed to standard error for each object rendered.
method canvas-tags($obj --> Hash) {
    unless %!canvas-tags{$obj}.defined {
        $*ERR.print: '.';
        my &callback = TextDecoder.new.callback;
        my $gfx = $obj.gfx: :&callback, :$!strict;
        $obj.render;

        # only tags that carry an MCID are indexed
        my PDF::Content::Tag %by-mcid = $gfx.tags.grep(*.mcid.defined).map: {.mcid => $_ };
        %!canvas-tags{$obj} = %by-mcid;
    }
    %!canvas-tags{$obj};
}



use PDF::Class;
use PDF::Tags::Reader;
# read tags
my PDF::Class $pdf .= open: "t/pdf/tagged.pdf";
my PDF::Tags::Reader $tags .= read: :$pdf;
# the first child of the tag tree is taken as the document root element
my PDF::Tags::Elem $doc = $tags[0];
say "document root {$doc.name}";
# list the names of the root element's immediate children
say " - child {.name}" for $doc.kids;
say $doc.xml; # dump tags and text content as XML


This module implements reading of tagged PDF content from PDF files.


This class inherits from L<PDF::Tags|https://pdf-raku.github.io/PDF-Tags-raku/> and has its methods available.


method read(PDF::Class :$pdf!, Bool :$create, Bool :$marks) returns PDF::Tags


Read tagged PDF structure from an existing file that has been previously tagged.


The `:create` option creates a new struct-tree root, if one does not already exist.


The `:marks` option causes PDF::Tags::Reader to descend into content and build a more
detailed structure that includes the actual marks in the content stream as L<PDF::Tags::Mark>
objects. Otherwise just the content text is inserted as a child of type `Str`.


method canvas-tags(PDF::Content::Canvas) returns Hash


Renders a canvas object (Page or XObject form) and caches
marked content as a hash of L<PDF::Content::Tag> objects,
indexed by `MCID` (Marked Content ID).


This script reads tagged PDF content from PDF files as XML.


Reader

#!/usr/bin/env perl6
use v6;

use PDF::Class;
use PDF::Catalog;
use PDF::StructTreeRoot;
use PDF::Tags::Reader;
use PDF::Tags::XML-Writer;
use PDF::Tags::Node :TagName;
use PDF::IO;

subset Number of Int where * > 0;

# Script entry point: reads a tagged PDF and writes its structure tree to
# standard output as XML. The PDF is read from $infile, or from standard
# input when $infile is '-'. The trailing `#=` comments are declarator
# docs attached to the individual parameters.
sub MAIN(Str $infile,               #= input PDF
         Str     :$password = '',   #= password for the input PDF, if encrypted
         Number  :$max-depth = 16,  #= depth to ascend/descend struct tree
         Bool    :$atts = True,     #= include attributes in tags
         Bool    :$debug,           #= write extra debugging information
         Bool    :$marks,           #= descend into marked content
         Bool    :$strict = True,   #= warn about unknown tags, etc
         Bool    :$style = True,    #= include stylesheet header
         Str     :$select,          #= XPath of twigs to include (relative to root)
         TagName :$omit,            #= Tags to omit from output
         TagName :$root-tag,        #= Outer root tag name
        ) {

    # Standard input is read in full up front; a named file is passed on
    # as an IO::Path. `.slurp` is used because `.slurp-rest` is deprecated.
    my PDF::IO $input .= coerce(
       $infile eq '-'
           ?? $*IN.slurp( :bin )      # sequential access
           !! $infile.IO              # random access
    );

    my PDF::Class $pdf .= open( $input, :$password );
    my PDF::Tags::Reader $dom .= read: :$pdf, :$strict, :$marks;
    my PDF::Tags::XML-Writer $xml .= new: :$max-depth, :$atts, :$debug, :$omit, :$style, :$root-tag, :$marks;

    # Either the nodes matched by --select, or just the root of the tree
    my PDF::Tags::Node @nodes = do with $select {
        $dom.find($_);
    }
    else {
        $dom.root;
    }

    my UInt $depth = 0;

    # When --root-tag is given and the first node is not the PDF::Tags
    # object itself, wrap the output in an explicit outer element and
    # write the nodes one level deeper.
    with $root-tag {
        unless @nodes[0] ~~ PDF::Tags:D {
            say '<' ~ $_ ~ '>';
            $depth++;
        }
    }

    $xml.say($*OUT, $_, :$depth) for @nodes;

    # close the outer element if one was opened above
    say '</' ~ $root-tag ~ '>' if $depth;
}



Options:
   --password          password for an encrypted PDF
   --max-depth=n       maximum tag-depth to descend
   --select=XPath      nodes to be included
   --omit=tag-name     nodes to be excluded
   --root-tag=tag-name define outer root tag
   --marks             descend into marked content
   --debug             add debugging to output
   --/atts             omit attributes in tags
   --/strict           suppress warnings
   --/style            omit root stylesheet link


Dumps structure elements from a tagged PDF.


Produces tagged output in an XML format.


Only some PDF files contain tagged PDF. pdf-info.raku can be
used to check this:


% pdf-info.raku my-doc.pdf | grep Tagged:
Tagged:     yes


This script requires the freetype6 native library and the PDF::Font::Loader
Raku module to be installed on your system.


pdf-tag-dump

The Raku Knowledge Base


=begin React :component<HeaderCol> :id<footer>  
=begin nested :!nested
=item1 B<Language>
    =item2 L<Get started|file:../getting-started/getting-started.podlite>
    =item2 L<Why Raku?|/doc/language/faq#Why-should-I-learn-Raku-What's-so-great-about-it>
    =item2 L<Try Raku|https://glot.io/new/raku>
    =item2 L<Raku cheat sheet|https://github.com/Raku/mu/blob/master/docs/Perl6/Cheatsheet/cheatsheet.txt>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<Raku on Exercism|https://exercism.org/tracks/raku>
    =item2 L<Wikipedia|https://en.wikipedia.org/wiki/Raku_(programming_language)>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>

=end nested
=begin nested :!nested

=item1 B<L<Documentation|/doc>>
    =item2 L<Getting started, Migration guides from other languages, & Tutorials | file:../doc/introduction.podlite>
    =item2 L<Language References|file:../doc/reference.podlite>
    =item2 L<Type Reference| file:../doc/types.podlite>
    =item2 L<Miscellaneous| file:../doc/miscellaneous.podlite>
    =item2 L<FAQs (Frequently Asked Questions)|/doc/language/faq>
    =item2 L<Community|/doc/language/community>
    =item2 L<The list of all documents|file:../doc/index.podlite>

=item1 B<Resources>
    =item2 L<The Raku Guide|https://raku.guide/>
    =item2 L<Books |https://perl6book.com/>
    =item2 L<Rosetta Code | https://www.raku.org/community/rosettacode>

=end nested
=begin nested :!nested

=item1 B<Resources>
    =item2 L<Download | file:../download/index.podlite>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>

=item1 B<Learning>
    =item2 L<The Raku Guide|https://raku.guide/>
    =item2 L<Wikibook |https://en.wikibooks.org/wiki/Raku_Programming>
    =item2 L<Books |https://perl6book.com/>
    =item2 L<Rosetta Code | https://www.raku.org/community/rosettacode>
    =item2 L<Learn Raku in Y minutes|https://learnxinyminutes.com/docs/raku/>
    =item2 L<Golfing |https://github.com/AlexDaniel/raku-golf-cheatsheet>
=end nested
=begin nested :!nested

=item1 B<Explore>
    =item2 L<Raku Blog Aggregator|https://planet.raku.org/>
    =item2 L<Rakudo Weekly|https://rakudoweekly.blog/>
    =item2 L<The Weekly Challenge |https://perlweeklychallenge.org/>
    =item2 L<Raku Advent Calendar|https://raku-advent.blog/>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>

=end nested
=end React


=begin React :component<CookieConsent> :id<CookieConsent> :buttonCaption("Got it!")
=para
This website uses cookies for analytics.
=end React


index

Synopsis

Description

Methods

method read

method canvas-tags

Scripts in this Distribution

`pdf-tag-dump.raku`

PDF::Tags::Reader v0.0.4

Authors

License

Dependencies

Test Dependencies

Provides

Documentation