docbook-fixup.pl 5.71 KB
Newer Older
1
#!/usr/bin/perl -w
2

3 4 5
# Fix the output of `makeinfo --docbook` version 4.0c
# Convert the broken docbook output to well-formed XML that conforms to the O'Reilly idiom
# See code for detailed comments
6
# Authors: Arjen Lentz and Zak Greant (original code by Jeremy Cole)
7

8
use strict;
9

10 11 12 13 14
# Working state shared by the processing steps below:
#   $data  - the whole document, rewritten in place by every step
#   @apx   - ids of the appendices that get removed; $apx iterates over them
#   @nodes - ids collected from the removed appendices; $nodes is their
#            regex alternation
my ($data, $apx, $nodes) = ('', '', '');
my (@apx, @nodes);

msg("-- Post-processing `makeinfo --docbook` output --");
msg("** Written to work with makeinfo version 4.0c **\n");

# <> is the magic filehandle: it reads from the files named on the command
# line, or from STDIN when none are given. A single read in void context
# consumes one line (assumed to be the DTD declaration) and throws it away.
msg("Discarding DTD - not required by subsequent scripts");
<>;

# Start the document with an XML declaration...
msg("Create an XML PI with ISO-8859-1 character encoding");
$data = "<?xml version='1.0' encoding='ISO-8859-1'?>";

# ...and append everything that is left on the input.
msg("Get the rest of the data");
$data .= join "", <>;

# No g (global) modifier on this substitution: the opening <book> tag can
# only occur once, so the search stops at the first hit.
msg("Add missing <bookinfo> and <abstract> opening tags");
$data =~ s{<book lang="en">}{<book lang="en"><bookinfo><abstract>};
33

34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53

# arjen 2002-05-01
# Expand the FIXUP<name>FIXUP placeholder strings into the entity or element
# each one stands for. Every placeholder is handled globally.
msg ("Processing docbook-prefix special strings");
$data =~ s{FIXUPmdashFIXUP}{&mdash;}g;

$data =~ s{FIXUPdoubledashFIXUP}{--}g;

# The role value has to be quoted: an unquoted attribute value is not
# well-formed XML, and well-formed output is the whole point of this script.
$data =~ s{FIXUPstrongFIXUP}{<emphasis role="bold">}g;
$data =~ s{FIXUPendstrongFIXUP}{</emphasis>}g;

$data =~ s{FIXUPemphFIXUP}{<emphasis>}g;
$data =~ s{FIXUPendemphFIXUP}{</emphasis>}g;

$data =~ s{FIXUPfileFIXUP}{<filename>}g;
$data =~ s{FIXUPendfileFIXUP}{</filename>}g;

$data =~ s{FIXUPsampFIXUP}{<literal>}g;
$data =~ s{FIXUPendsampFIXUP}{</literal>}g;


54 55 56 57 58 59 60
# Drop the scheme prefix wherever it occurs, leaving the bare address.
msg ("Removing mailto: from email addresses...");
$data =~ s/mailto://g;

# /s lets the non-greedy body span several lines.
msg ("Removing INFORMALFIGURE...");
$data =~ s/<informalfigure>.+?<\/informalfigure>//gs;

# Only ampersands that are not already the start of a named entity
# reference (word characters followed by ';') get escaped.
msg ("Convert ampersand to XML escape sequence...");
$data =~ s{&(?!\w+;)}{&amp;}g;

# arjen 2002-05-01
msg ("Changing (TM) to XML escape sequence...");
$data =~ s{MySQL \(TM\)}{MySQL&trade;}g;
$data =~ s/<command>TM<\/command>/&trade;/g;

# arjen 2002-05-01
# The surrounding spaces are part of the match and are dropped with it.
msg ("Changing ' -- ' to XML escape sequence...");
$data =~ s{ -- }{&mdash;}g;

msg ("Changing @@ to @...");
$data =~ s/\@\@/\@/g;
76

77 78 79
# A single digit in angle brackets becomes the same digit in square brackets.
msg ("Rework references of the notation '<n>'");
# Need to talk to Arjen about what the <n> bits are for
$data =~ s{<(\d)>}{[$1]}g;

# The whole attribute (name, '=' and quoted value) is handed to the helper.
msg ("Changing '_' to '-' in references...");
$data =~ s/((?:id|linkend)=".+?")/&underscore2hyphen($1)/ge;

# Only ULINKs without any content (whitespace at most) are converted.
msg ("Changing ULINK to SYSTEMITEM...");
$data =~ s/<ulink url="(.+?)">\s*<\/ulink>/<systemitem role="url">$1<\/systemitem>/gs;

msg ("Adding PARA inside ENTRY...");
$data =~ s/<entry>(.*?)<\/entry>/<entry><para>$1<\/para><\/entry>/gs;

# A closing tag immediately followed by two or more word characters gets a
# separating space.
msg ("Fixing spacing problem with titles...");
$data =~ s/(<\/\w+>)(\w{2,})/$1 $2/gs;

msg ("Adding closing / to XREF and COLSPEC tags...");
$data =~ s/<(xref|colspec) (.+?)>/<$1 $2 \/>/gs;

# arjen 2002-04-26
# The link text is discarded; only the linkend attribute survives.
msg ("Removing separate target titles from LINKs and make them XREFs...");
$data =~ s/<link (linkend=.+?)>.+?<\/link>/<xref $1 \/>/gs;

# Probably need to strip these
msg ('Adding "See " to XREFs that used to be @xref...');
$data =~ s/([.'!)])\s*<xref /$1 See <xref /gs;

# Here the whitespace between the punctuation and the XREF is preserved.
msg ('Adding "see " to (XREFs) that used to be (@pxref)...');
$data =~ s/([([,;])(\s*)<xref /${1}${2}see <xref /gs;

115 116 117
# The first <row> after <tbody> is moved into a new <thead>; the captured
# leading spaces are reused to indent the inserted tags.
msg ("Making first row in table THEAD...");
$data =~ s/( *)<tbody>(\s*<row>.+?<\/row>)/$1<thead>$2\n$1<\/thead>\n$1<tbody>/gs;

# The replacement is evaluated as code (/e) so the helper can process the
# THEAD content.
msg ("Removing EMPHASIS inside THEAD...");
$data =~ s{<thead>(.+?)</thead>}
          {'<thead>' . &strip_tag($1, 'emphasis') . '</thead>'}gse;

msg ("Removing empty PARA...");
$data =~ s/<para>\s*<\/para>//gs;

msg ("Removing lf before /PARA in ENTRY...");
$data =~ s/\n(<\/para><\/entry>)/$1/gs;

# Only tabs and spaces are removed, so a </para> on its own line keeps its
# line break.
msg ("Removing whitespace before /PARA if not on separate line...");
$data =~ s/(\S+)[\t ]+<\/para>/$1<\/para>/g;

# Matches a PARA holding nothing but one or more INDEXTERM elements (each
# made of primary/secondary children) and keeps just the INDEXTERMs.
msg ("Removing PARA around INDEXTERM if no text in PARA...");
$data =~ s{<para>((?:<indexterm role="[^"]+">(?:<(primary|secondary)>[^>]+</\2>)+?</indexterm>)+?)\s*</para>}{$1}gs;
138

139
# Ids of the appendices that are dropped from the output.
@apx = ("Users", "MySQL Testimonials", "News", "GPL-license", "LGPL-license");

foreach $apx (@apx) {
    msg ("Removing appendix $apx...");

    # Remove the appendix and keep a copy of everything that was removed.
    # The removed text is captured explicitly instead of being read from $&
    # afterwards: $& still holds the text of some earlier successful match
    # when this substitution finds nothing, so it cannot tell whether the
    # appendix was present, and with /g it only reflects the last match.
    # \Q...\E makes sure the id is matched literally.
    my @removed = ();
    $data =~ s{(<appendix id="\Q$apx\E">.+?</appendix>)}
              { push @removed, $1; '' }gse;

    # Skip to the next appendix if nothing was removed
    next unless @removed;

    msg ("...Building list of removed nodes...");

    # Go through the removed text line by line and remember the id of the
    # first id-carrying tag on each line
    foreach my $chunk (@removed) {
        foreach (split "\n", $chunk) {
            push @nodes, $1 if /<\w+ id=\"(.+?)\">/;
        }
    }
}
unknown's avatar
unknown committed
157

158
# 2002-02-22 arjen@mysql.com (added fix " /" to end of regex, to make it match)
159 160 161 162 163 164 165
msg ("Fixing references to removed nodes...");

# With no removed nodes the alternation would be empty and would match any
# tag with linkend="", so the rewrite only runs when there is something
# to look for.
if (@nodes) {
    # Merge the list of node names into a set of regex alternations.
    # Each name is quoted so that regex metacharacters in an id are matched
    # literally instead of changing the pattern.
    $nodes = join "|", map { quotemeta } @nodes;

    # Find all references to removed nodes and convert them to absolute URLs
    $data =~ s{<\w+ linkend="($nodes)" />}
              {&xref2link($1)}ges;
}

# Emit the finished document
print STDOUT $data;
exit;

#
# Definitions for helper sub-routines
#

# Write a progress message to STDERR, prefixed with the script name and
# terminated with a newline.
#   $text - the message to report
sub msg {
    my ($text) = @_;
    print STDERR "docbook-fixup:", $text, "\n";
}

# Remove every <tag>...</tag> pair from a string while keeping the text
# between the tags.
#   $str - the text to process
#   $tag - the element name, without angle brackets
# Returns the processed copy; the argument itself is not modified.
sub strip_tag($$) {
    my ($str, $tag) = @_;

    # \Q...\E matches the tag name literally. The body is .*? rather than
    # .+? so that an empty element is removed as well; otherwise the match
    # would run on into the following element and leave unbalanced tags.
    $str =~ s{<\Q$tag\E>(.*?)</\Q$tag\E>}{$1}gs;

    return $str;
}

# Return a copy of the given string with every underscore replaced by a
# hyphen.
sub underscore2hyphen($) {
    my ($text) = @_;
    $text =~ tr{_}{-};
    return $text;
}

# Build the absolute URL for a node name.
# Spaces become underscores; a name of three or more characters is then
# turned into "<first char>/<second char>/<name>.html". Shorter names are
# appended to the base URL as they are.
sub xref2link {
    my ($node) = @_;
    $node =~ tr/ /_/;
    $node =~ s{^(.)(.)(.+)$}{$1/$2/$1$2$3.html};
    return "http://www.mysql.com/doc/" . $node;
}

# We might need to encode the high-bit characters to ensure proper representation.
# msg ("Converting high-bit characters to entities");
# $data =~ s/([\200-\377])/&get_entity($1)/gse;
# There is no get_entity function yet - no point writing it until we need it :)