head

code

item

root

para

L<Examples| file:../examples/index.podlite>


L<Download |file:../download/index.podlite>


React

=begin React :component<HeaderCol> :id<menu> 
=para L<Documentation|/doc/introduction>
=para L<Modules| /mods>
=para L<Examples| file:../examples/index.podlite>
=para L<Download |file:../download/index.podlite>
=para L<About|file:../about/index.podlite>
=para 🔍 K<⌘K>/K<ctrl-K>
=end React


HEADER

FOOTER

README

table_cell

table_row

table

index

use PDF::Tags;

unit class PDF::Tags::Reader:ver<0.0.4>
    is PDF::Tags;

use PDF::Font::Loader::FontObj;
use PDF::Font::Loader;
use PDF::Content::Canvas;
use PDF::Content::Font;
use PDF::Content::FontObj;
use PDF::Content::Ops :GraphicsContext;
use PDF::Content::Matrix :&is-identity;
use PDF::Content::Tag :InlineElemTags;
use PDF::Class;

has Bool $.strict = True;
has Bool $.marks;

#| Build a reader over the structure tree of an already-tagged PDF.
#|   :$pdf    - the open PDF document (required)
#|   :$create - when true, fall back to self.create() if the catalog has
#|              no StructTreeRoot
#|   |c       - remaining arguments, forwarded to new() or create()
#| Fails when the catalog has no StructTreeRoot and :create was not given.
method read(PDF::Class:D :$pdf!, Bool :$create, |c --> PDF::Tags:D) {
    my $cos := $pdf.catalog.StructTreeRoot;

    # existing structure tree: wrap it
    return self.new(:$cos, :root(self.WHAT), |c)
        if $cos.defined;

    # nothing to read; optionally start a fresh tree
    return self.create(:$pdf, |c)
        if $create;

    fail "PDF document does not contain marked content";
}

# Content-stream callback handler. Each method is named after a graphics
# operator; the closure returned by .callback dispatches operators to them.
# Decoded text is appended to the children of the marked-content tag that is
# current when the text operator is seen. Text inside Artifact tags is skipped.
# The methods read the dynamic variable $*gfx, presumably set by the renderer
# that invokes the callback (not visible here).
class TextDecoder {
    use PDF::Content::Ops :OpCode;
    use Method::Also;
    has Hash @!save;                # stack of font / font-size snapshots pushed by Save()
    has PDF::Content::Font $!font;  # font dictionary most recently selected
    has $.current-font;             # NOTE(review): accessor is replaced by the method below; attribute is not otherwise used here
    has Numeric $.font-size = 10;
    has PDF::Content::Tag $.mark;   # most recently opened tag carrying an MCID; reset when that tag closes
    has Int $!artifact;             # count of currently open Artifact tags
    has Numeric $!ty;               # value recorded by !set-ty at the last text operation

    # Returns the font object of the current font dictionary. When the
    # dictionary has no PDF::Content::FontObj yet, PDF::Font::Loader is asked
    # to load it first (presumably attaching it to the dictionary).
    method current-font {
        PDF::Font::Loader.load-font: :dict($!font)
            unless $!font.font-obj ~~ PDF::Content::FontObj:D;
        $!font.font-obj;
    }

    # Returns a sub ($op, *@args) that maps the operator to its OpCode key
    # and calls the like-named method on this object, if there is one.
    # Operators without a matching method are ignored.
    method callback {
        sub ($op, *@args) {
            my $method = OpCode($op).key;
            self."$method"(|@args)
                if self.can($method);
        }
    }

    # A marked-content sequence has been opened; inspect the newest open tag.
    method BeginMarkedContent($,$?) is also<BeginMarkedContentDict> {
        given $*gfx.tags.open-tags.tail -> $tag {
            $!artifact++ if $tag.name eq Artifact;
            # test definedness, not truthiness: an MCID of 0 is valid but false
            $!mark = $tag if $tag.mcid.defined;
        }
    }

    # A marked-content sequence has been closed; undo what
    # BeginMarkedContent recorded for that tag.
    method EndMarkedContent() {
        with $*gfx.tags.closed-tag -> $tag {
            $!artifact-- if $tag.name eq Artifact;
            with $!mark {
                # forget the mark once its own tag is closed
                $_ = Nil if $_ === $tag;
            }
        }
    }

    # Push the current font and font size.
    method Save()      {
        @!save.push: %( :$!font, :$!font-size );
    }

    # Pop the last saved font and font size; a no-op on an empty stack.
    method Restore()   {
        if @!save {
            given @!save.pop {
                $!font = .<font>;
                $!font-size = .<font-size>;
            }
        }
    }

    # Record the new font size (second operand) and pick up the font face
    # from the graphics state when it is defined.
    method SetFont($, $!font-size) {
        $!font = $_ with $*gfx.font-face;
    }

    # Refresh font and font size when the applied graphics state has a Font entry.
    method SetGraphicsState($gs) {
        if $gs<Font>:exists {
            $!font = $*gfx.font-face;
            $!font-size = $*gfx.font-size;
        }
    }

    # Append $text to the current mark, or failing that to the newest open
    # tag. Text is concatenated onto a trailing Str child when there is one,
    # otherwise pushed as a new child. Text with no tag at all is reported
    # via note().
    method !save-text($text) {
        with $!mark // $*gfx.open-tags.tail -> $tag {
            given $tag.children {
                if .tail ~~ Str:D {
                    .tail ~= $text;
                }
                else {
                    $tag.children.push: $text;
                }
            }
        }
        else {
            note "untagged text: {$text}";
        }
    }

    # Record element 5 of the text matrix divided by element 3
    # (presumably vertical translation over vertical scale - TODO confirm).
    # NOTE(review): element 3 could be zero for some matrices; confirm how
    # that should be handled.
    method !set-ty { $!ty = .[5] / .[3] given $*gfx.TextMatrix; }

    # Decode a shown string with the current font and save it.
    method ShowText($_) {
        unless $!artifact {
            self!set-ty;
            my $text = $.current-font.decode($_, :str);
            self!save-text: $text;
        }
    }

    # Decode a list that mixes strings and numeric adjustments.
    # Strings are decoded; an adjustment of -120 or less is taken as an
    # implicit space unless the previous string already ends in whitespace;
    # any other adjustment contributes nothing.
    method ShowSpaceText(List $_) {
        unless $!artifact {
            self!set-ty;
            my Str $last := ' ';
            my @chunks = .map: {
                when Str {
                    $last := $.current-font.decode($_, :str);
                }
                when $_ <= -120 && !($last ~~ /\s$/) {
                    # assume implicit space
                    ' '
                }
                default { Empty }
            }

            self!save-text: @chunks.join;
        }
    }

    method TextNextLine(|) is also<TextMoveSet MoveShowText MoveSetShowText> {
        # treat these as explicit newlines
        unless $!artifact {
            self!save-text: "\n";
        }
    }

    method TextMove($x, $y) {
        # treat a significant vertical shift from the
        # last text positioning as an explicit newline
        unless $!artifact {
            my $old-ty = $!ty;
            my $new-ty = self!set-ty;
            with $old-ty {
                # shift expressed relative to the font size; anything
                # outside -0.3 .. 0.3 counts as a line change
                my $leading = ($_ - $new-ty) / $!font-size;
                self!save-text: "\n"
                    unless -.3 <= $leading <= .3;
            }
        }
    }

    # Not implemented: only warns, and only outside of artifacts.
    method Do($key) {
        warn "todo Do $key"
            unless $!artifact;
    }
}
# Type of one cache entry: a Hash whose values are PDF::Content::Tag objects.
constant Tags = Hash[PDF::Content::Tag];
# Cache of tag hashes keyed by canvas object; filled on demand by canvas-tags().
has Tags %!canvas-tags{PDF::Content::Canvas};

#| Return the marked-content tags of a canvas as a hash keyed by MCID.
#| The canvas is rendered with a TextDecoder callback the first time it is
#| requested and the result is cached per canvas object. A '.' is printed
#| to standard error each time a canvas is actually rendered.
method canvas-tags($obj --> Hash) {
    without %!canvas-tags{$obj} {
        $*ERR.print: '.';
        my &callback = TextDecoder.new.callback;
        my $canvas-gfx = $obj.gfx: :&callback, :$!strict;
        $obj.render;
        # keep only the tags that carry an MCID
        my PDF::Content::Tag %by-mcid = $canvas-gfx.tags.grep(*.mcid.defined).map: { .mcid => $_ };
        %!canvas-tags{$obj} = %by-mcid;
    }
    %!canvas-tags{$obj};
}



use PDF::Class;
use PDF::Tags::Reader;
# read tags
my PDF::Class $pdf .= open: "t/pdf/tagged.pdf";
my PDF::Tags::Reader $tags .= read: :$pdf;
# take the first element of the structure tree as the document root
my PDF::Tags::Elem $doc = $tags[0];
say "document root {$doc.name}";
# list the root's immediate children
say " - child {.name}" for $doc.kids;
say $doc.xml; # dump tags and text content as XML


This module implements reading of tagged PDF content from PDF files.


This class inherits from L<PDF::Tags|https://pdf-raku.github.io/PDF-Tags-raku/> and has its methods available.


method read(PDF::Class :$pdf!, Bool :$create, Bool :$marks) returns PDF::Tags


Read tagged PDF structure from an existing file that has been previously tagged.


The `:create` option creates a new struct-tree root, if one does not already exist.


The `:marks` option causes PDF::Tags::Reader to descend into content and build a more
detailed structure that includes the actual marks in the content stream as L<PDF::Tags::Mark>
objects. Otherwise just the content text is inserted as a child of type `Str`.


method canvas-tags(PDF::Content::Canvas) returns Hash


Renders a canvas object (Page or XObject form) and caches
marked content as a hash of L<PDF::Content::Tag> objects,
indexed by `MCID` (Marked Content ID).


This script reads tagged content from PDF files and dumps it as XML.


Reader

The Raku Knowledge Base


=begin React :component<HeaderCol> :id<footer>  
=begin nested :!nested
=item1 B<Language>
    =item2 L<Get started|file:../getting-started/getting-started.podlite>
    =item2 L<Why Raku?|/doc/language/faq#Why-should-I-learn-Raku-What's-so-great-about-it>
    =item2 L<Try Raku|https://glot.io/new/raku>
    =item2 L<Raku cheat sheet|https://github.com/Raku/mu/blob/master/docs/Perl6/Cheatsheet/cheatsheet.txt>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<Raku on Exercism|https://exercism.org/tracks/raku>
    =item2 L<Wikipedia|https://en.wikipedia.org/wiki/Raku_(programming_language)>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>

=end nested
=begin nested :!nested

=item1 B<L<Documentation|/doc>>
    =item2 L<Getting started, Migration guides from other languages, & Tutorials | file:../doc/introduction.podlite>
    =item2 L<Language References|file:../doc/reference.podlite>
    =item2 L<Type Reference| file:../doc/types.podlite>
    =item2 L<Miscellaneous| file:../doc/miscellaneous.podlite>
    =item2 L<FAQs (Frequently Asked Questions)|/doc/language/faq>
    =item2 L<Community|/doc/language/community>
    =item2 L<The list of all documents|file:../doc/index.podlite>

=item1 B<Resources>
    =item2 L<The Raku Guide|https://raku.guide/>
    =item2 L<Books |https://perl6book.com/>
    =item2 L<Rosetta Code | https://www.raku.org/community/rosettacode>

=end nested
=begin nested :!nested

=item1 B<Resources>
    =item2 L<Download | file:../download/index.podlite>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>

=item1 B<Learning>
    =item2 L<The Raku Guide|https://raku.guide/>
    =item2 L<Wikibook |https://en.wikibooks.org/wiki/Raku_Programming>
    =item2 L<Books |https://perl6book.com/>
    =item2 L<Rosetta Code | https://www.raku.org/community/rosettacode>
    =item2 L<Learn Raku in Y minutes|https://learnxinyminutes.com/docs/raku/>
    =item2 L<Golfing |https://github.com/AlexDaniel/raku-golf-cheatsheet>
=end nested
=begin nested :!nested

=item1 B<Explore>
    =item2 L<Raku Blog Aggregator|https://planet.raku.org/>
    =item2 L<Rakudo Weekly|https://rakudoweekly.blog/>
    =item2 L<The Weekly Challenge |https://perlweeklychallenge.org/>
    =item2 L<Raku Advent Calendar|https://raku-advent.blog/>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>
    =item2 L<empty|#>

=end nested
=end React


=begin React :component<CookieConsent> :id<CookieConsent> :buttonCaption("Got it!")
=para
This website uses cookies for analytics.
=end React


README

Synopsis

Description

Methods

method read

method canvas-tags

Scripts in this Distribution

`pdf-tag-dump.raku`

PDF::Tags::Reader v0.0.4

Authors

License

Dependencies

Test Dependencies

Provides

Documentation