wok-current annotate html2text/stuff/patch-utf8-html2text-1.3.2a.diff @ rev 23069

scilab: update for hdf5
author Pascal Bellard <pascal.bellard@slitaz.org>
date Tue Mar 10 08:35:31 2020 +0100 (2020-03-10)
parents
children
rev   line source
al@18840 1 diff -r -u -bB html2text-1.3.2a/Area.C html2text-1.3.2a-patched/Area.C
al@18840 2 --- html2text-1.3.2a/Area.C 2003-11-23 12:05:29.000000000 +0100
al@18840 3 +++ html2text-1.3.2a-patched/Area.C 2005-05-13 22:19:59.862137688 +0200
al@18840 4 @@ -36,10 +36,13 @@
al@18840 5 #include <iostream>
al@18840 6
al@18840 7 #include "Area.h"
al@18840 8 +#include "html.h"
al@18840 9 #include "string.h"
al@18840 10
al@18840 11 #define LATIN1_nbsp 160
al@18840 12
al@18840 13 +extern int use_encoding;
al@18840 14 +
al@18840 15 /* ------------------------------------------------------------------------- */
al@18840 16
al@18840 17 #define malloc_array(type, size)\
al@18840 18 @@ -81,6 +84,27 @@
al@18840 19
al@18840 20 /* ------------------------------------------------------------------------- */
al@18840 21
al@18840 22 +/* utf_length() and utf_width()
al@18840 23 + *
al@18840 24 + * Very simplified algorithm for calculating the length of a UTF-8
al@18840 25 + * string. No error checking. Counts only ASCII bytes and the
al@18840 26 + * leading bytes of UTF-8 multibyte sequences; all bytes of the
al@18840 27 + * form 10xxxxxx are dropped. If USE_UTF8 is false, the usual
al@18840 28 + * (byte) length is returned. --YS
al@18840 29 + */
al@18840 30 +
al@18840 31 +unsigned int
al@18840 32 +Line::utf_length(size_type f, size_type t) const
al@18840 33 +{ /* Number of characters (not bytes) in cells [f, min(t, length_)). */
al@18840 34 + size_type m = (t < length_ ? t : length_);
al@18840 35 + size_type r = (f < m ? m - f : 0); /* empty range: no unsigned wrap-around */
al@18840 36 + if(USE_UTF8) {
al@18840 37 + for (size_type i = f; i < m; i++) /* index has the same type as the bounds */
al@18840 38 + if((cells_[i].character & 0xc0) == 0x80) r--; /* 10xxxxxx: continuation byte */
al@18840 39 + }
al@18840 40 + return r;
al@18840 41 +}
al@18840 42 +
al@18840 43 void
al@18840 44 Line::resize(size_type l)
al@18840 45 {
al@18840 46 @@ -236,6 +260,23 @@
al@18840 47 return *this;
al@18840 48 }
al@18840 49
al@18840 50 +unsigned int
al@18840 51 +Area::utf_width()
al@18840 52 +{ /* Widest row in characters; outside UTF-8 mode this is simply width_. */
al@18840 53 + size_type r = width_;
al@18840 54 + if(USE_UTF8) { r = 0;
al@18840 55 + for (size_type yy = 0; yy < height_; yy++) {
al@18840 56 + size_type r1 = 0; /* characters counted so far in row yy */
al@18840 57 + for (int i = width_ - 1; i >= 0; i--) { /* right to left; signed so i >= 0 can fail */
al@18840 58 + if(!r1 && isspace((unsigned char) cells_[yy][i].character)) continue; /* trailing blanks; cast avoids UB on bytes >= 0x80 */
al@18840 59 + if((cells_[yy][i].character & 0xc0) != 0x80) r1++; /* skip 10xxxxxx continuation bytes */
al@18840 60 + }
al@18840 61 + if(r < r1) r = r1;
al@18840 62 + }
al@18840 63 + }
al@18840 64 + return r;
al@18840 65 +}
al@18840 66 +
al@18840 67 void
al@18840 68 Area::resize(size_type w, size_type h)
al@18840 69 {
al@18840 70 @@ -439,7 +480,7 @@
al@18840 71 char c = p->character;
al@18840 72 char a = p->attribute;
al@18840 73
al@18840 74 - if (c == (char) LATIN1_nbsp) c = ' ';
al@18840 75 + if (c == (char) LATIN1_nbsp && !USE_UTF8) c = ' ';
al@18840 76
al@18840 77 if (a == Cell::NONE) {
al@18840 78 os << c;
al@18840 79 Nur in html2text-1.3.2a-patched/: Area.C.orig.
al@18840 80 diff -r -u -bB html2text-1.3.2a/Area.h html2text-1.3.2a-patched/Area.h
al@18840 81 --- html2text-1.3.2a/Area.h 2003-11-23 12:05:29.000000000 +0100
al@18840 82 +++ html2text-1.3.2a-patched/Area.h 2005-05-13 22:19:59.863137536 +0200
al@18840 83 @@ -81,6 +81,8 @@
al@18840 84 Cell &operator[](size_type x) { return cells_[x]; }
al@18840 85 const Cell *cells() const { return cells_; }
al@18840 86
al@18840 87 + unsigned int utf_length(size_type f, size_type t) const;
al@18840 88 +
al@18840 89 void resize(size_type l);
al@18840 90 void enlarge(size_type l) { if (l > length_) resize(l); }
al@18840 91
al@18840 92 @@ -134,6 +136,8 @@
al@18840 93 Cell *operator[](size_type y) { return cells_[y]; }
al@18840 94 const Area &operator>>=(size_type rs);
al@18840 95
al@18840 96 + unsigned int utf_width();
al@18840 97 +
al@18840 98 void resize(size_type w, size_type h);
al@18840 99 void enlarge(size_type w, size_type h);
al@18840 100
al@18840 101 Nur in html2text-1.3.2a-patched/: Area.h.orig.
al@18840 102 diff -r -u -bB html2text-1.3.2a/format.C html2text-1.3.2a-patched/format.C
al@18840 103 --- html2text-1.3.2a/format.C 2003-11-23 12:05:29.000000000 +0100
al@18840 104 +++ html2text-1.3.2a-patched/format.C 2005-05-13 22:19:59.865137232 +0200
al@18840 105 @@ -1210,6 +1210,7 @@
al@18840 106 }
al@18840 107
al@18840 108 Line::size_type to = from + 1;
al@18840 109 + int to_from;
al@18840 110
al@18840 111 Line::size_type lbp = (Line::size_type) -1; // "Last break position".
al@18840 112
al@18840 113 @@ -1238,18 +1239,20 @@
al@18840 114 to++;
al@18840 115 }
al@18840 116
al@18840 117 - if (to - from > w && lbp != (Area::size_type) -1) { to = lbp; break; }
al@18840 118 + if (line.utf_length(from,to) > w && lbp != (Area::size_type) -1)
al@18840 119 + { to = lbp; break; }
al@18840 120 }
al@18840 121
al@18840 122 + to_from = line.utf_length(from,to);
al@18840 123 /*
al@18840 124 * Copy the "from...to" range from the "line" to the bottom of the "res"
al@18840 125 * Area.
al@18840 126 */
al@18840 127 Area::size_type x = 0;
al@18840 128 Area::size_type len = to - from;
al@18840 129 - if (halign == Area::LEFT || len >= w) { ; } else
al@18840 130 - if (halign == Area::CENTER) { x += (w - len) / 2; } else
al@18840 131 - if (halign == Area::RIGHT) { x += w - len; }
al@18840 132 + if (halign == Area::LEFT || to_from >= w) { ; } else
al@18840 133 + if (halign == Area::CENTER) { x += (w - to_from) / 2; } else
al@18840 134 + if (halign == Area::RIGHT) { x += w - to_from; }
al@18840 135 res->insert(line.cells() + from, len, x, res->height());
al@18840 136
al@18840 137 /*
al@18840 138 Nur in html2text-1.3.2a-patched/: format.C.orig.
al@18840 139 diff -r -u -bB html2text-1.3.2a/html2text.C html2text-1.3.2a-patched/html2text.C
al@18840 140 --- html2text-1.3.2a/html2text.C 2003-11-23 12:05:29.000000000 +0100
al@18840 141 +++ html2text-1.3.2a-patched/html2text.C 2005-05-13 22:19:59.868136776 +0200
al@18840 142 @@ -148,9 +148,10 @@
al@18840 143 -o <file> Redirect output into <file>\n\
al@18840 144 -nobs Do not use backspaces for boldface and underlining\n\
al@18840 145 -ascii Use plain ASCII for output instead of ISO-8859-1\n\
al@18840 146 + -utf8 Assume both terminal and input stream are in UTF-8 mode\n\
al@18840 147 ";
al@18840 148
al@18840 149 -int use_iso8859 = 1;
al@18840 150 +int use_encoding = ISO8859;
al@18840 151
al@18840 152 int
al@18840 153 main(int argc, char **argv)
al@18840 154 @@ -199,7 +200,8 @@
al@18840 155 if (!strcmp(arg, "-width" )) { width = atoi(argv[++i]); } else
al@18840 156 if (!strcmp(arg, "-o" )) { output_file_name = argv[++i]; } else
al@18840 157 if (!strcmp(arg, "-nobs" )) { use_backspaces = false; } else
al@18840 158 - if (!strcmp(arg, "-ascii" )) { use_iso8859 = false; } else
al@18840 159 + if (!strcmp(arg, "-ascii" )) { use_encoding = ASCII; } else
al@18840 160 + if (!strcmp(arg, "-utf8" )) { use_encoding = UTF8; } else
al@18840 161 {
al@18840 162 std::cerr
al@18840 163 << "Unrecognized command line option \""
al@18840 164 Nur in html2text-1.3.2a-patched/: html2text.C.orig.
al@18840 165 diff -r -u -bB html2text-1.3.2a/html.h html2text-1.3.2a-patched/html.h
al@18840 166 --- html2text-1.3.2a/html.h 2001-10-04 22:03:54.000000000 +0200
al@18840 167 +++ html2text-1.3.2a-patched/html.h 2005-05-13 22:19:59.866137080 +0200
al@18840 168 @@ -61,6 +61,11 @@
al@18840 169
al@18840 170 /* ------------------------------------------------------------------------- */
al@18840 171
al@18840 172 +enum {ASCII, ISO8859, UTF8}; /* values stored in the global use_encoding */
al@18840 173 +#define USE_ISO8859 (use_encoding == ISO8859) /* true when use_encoding is ISO8859 */
al@18840 174 +#define USE_ASCII (use_encoding == ASCII) /* true when use_encoding is ASCII */
al@18840 175 +#define USE_UTF8 (use_encoding == UTF8) /* true when use_encoding is UTF8 */
al@18840 176 +
al@18840 177 #define LATIN1_nbsp 160
al@18840 178 #define LATIN1_iexcl 161
al@18840 179 #define LATIN1_cent 162
al@18840 180 diff -r -u -bB html2text-1.3.2a/sgml.C html2text-1.3.2a-patched/sgml.C
al@18840 181 --- html2text-1.3.2a/sgml.C 2003-11-23 12:09:11.000000000 +0100
al@18840 182 +++ html2text-1.3.2a-patched/sgml.C 2005-05-13 22:19:59.870136472 +0200
al@18840 183 @@ -62,261 +62,280 @@
al@18840 184 char name[8];
al@18840 185 int iso8859code;
al@18840 186 char *asciistr;
al@18840 187 + unsigned long unicode;
al@18840 188 } entities[] = {
al@18840 189 - { "AElig", LATIN1_AElig, "AE" },
al@18840 190 - { "AMP", 0, "&" },
al@18840 191 - { "Aacute", LATIN1_Aacute, "A'" },
al@18840 192 - { "Acirc", LATIN1_Acirc, "A^" },
al@18840 193 - { "Agrave", LATIN1_Agrave, "A`" },
al@18840 194 - { "Alpha", 0, "A" },
al@18840 195 - { "Aring", LATIN1_Aring, "AA" },
al@18840 196 - { "Atilde", LATIN1_Atilde, "A~" },
al@18840 197 - { "Auml", LATIN1_Auml, "A\"" },
al@18840 198 - { "Beta", 0, "B" },
al@18840 199 - { "Ccedil", LATIN1_Ccedil, "C," },
al@18840 200 - { "Chi", 0, "H" },
al@18840 201 - { "Dagger", 0, "++" },
al@18840 202 - { "Delta", 0, "D" },
al@18840 203 - { "ETH", LATIN1_ETH, "D-" },
al@18840 204 - { "Eacute", LATIN1_Eacute, "E'" },
al@18840 205 - { "Ecirc", LATIN1_Ecirc, "E^" },
al@18840 206 - { "Egrave", LATIN1_Egrave, "E`" },
al@18840 207 - { "Epsilon", 0, "E" },
al@18840 208 - { "Eta", 0, "E" },
al@18840 209 - { "Euml", LATIN1_Euml, "E\"" },
al@18840 210 - { "GT", 0, ">" },
al@18840 211 - { "Gamma", 0, "G" },
al@18840 212 - { "Iacute", LATIN1_Iacute, "I'" },
al@18840 213 - { "Icirc", LATIN1_Icirc, "I^" },
al@18840 214 - { "Igrave", LATIN1_Igrave, "I`" },
al@18840 215 - { "Iota", 0, "I" },
al@18840 216 - { "Iuml", LATIN1_Iuml, "I\"" },
al@18840 217 - { "Kappa", 0, "K" },
al@18840 218 - { "LT", 0, "<" },
al@18840 219 - { "Lambda", 0, "L" },
al@18840 220 - { "Mu", 0, "M" },
al@18840 221 - { "Ntilde", LATIN1_Ntilde, "N~" },
al@18840 222 - { "Nu", 0, "N" },
al@18840 223 - { "OElig", 0, "OE" },
al@18840 224 - { "Oacute", LATIN1_Oacute, "O'" },
al@18840 225 - { "Ocirc", LATIN1_Ocirc, "O^" },
al@18840 226 - { "Ograve", LATIN1_Ograve, "O`" },
al@18840 227 - { "Omega", 0, "O" },
al@18840 228 - { "Omicron", 0, "O" },
al@18840 229 - { "Oslash", LATIN1_Oslash, "O/" },
al@18840 230 - { "Otilde", LATIN1_Otilde, "O~" },
al@18840 231 - { "Ouml", LATIN1_Ouml, "O\"" },
al@18840 232 - { "Phi", 0, "F" },
al@18840 233 - { "Pi", 0, "P" },
al@18840 234 - { "Prime", 0, "''" },
al@18840 235 - { "Psi", 0, "PS" },
al@18840 236 - { "QUOT", 0, "\"" },
al@18840 237 - { "Rho", 0, "R" },
al@18840 238 - { "Scaron", 0, "S" },
al@18840 239 - { "Sigma", 0, "S" },
al@18840 240 - { "THORN", LATIN1_THORN, "TH" },
al@18840 241 - { "Tau", 0, "T" },
al@18840 242 - { "Theta", 0, "TH" },
al@18840 243 - { "Uacute", LATIN1_Uacute, "U'" },
al@18840 244 - { "Ucirc", LATIN1_Ucirc, "U^" },
al@18840 245 - { "Ugrave", LATIN1_Ugrave, "U`" },
al@18840 246 - { "Upsilon", 0, "U" },
al@18840 247 - { "Uuml", LATIN1_Uuml, "U\"" },
al@18840 248 - { "Xi", 0, "X" },
al@18840 249 - { "Yacute", LATIN1_Yacute, "Y'" },
al@18840 250 - { "Yuml", 0, "Y\"" },
al@18840 251 - { "Zeta", 0, "Z" },
al@18840 252 - { "aacute", LATIN1_aacute, "a'" },
al@18840 253 - { "acirc", LATIN1_acirc, "a^" },
al@18840 254 - { "acute", LATIN1_acute, "'" },
al@18840 255 - { "aelig", LATIN1_aelig, "ae" },
al@18840 256 - { "agrave", LATIN1_agrave, "a`" },
al@18840 257 + { "AElig", LATIN1_AElig, "AE", 0x00c6},
al@18840 258 + { "AMP", 0, "&", 0x0026},
al@18840 259 + { "Aacute", LATIN1_Aacute, "A'", 0x00c1},
al@18840 260 + { "Acirc", LATIN1_Acirc, "A^", 0x00c2},
al@18840 261 + { "Agrave", LATIN1_Agrave, "A`", 0x00c0},
al@18840 262 + { "Alpha", 0, "A", 0x0391},
al@18840 263 + { "Aring", LATIN1_Aring, "AA", 0x00c5},
al@18840 264 + { "Atilde", LATIN1_Atilde, "A~", 0x00c3},
al@18840 265 + { "Auml", LATIN1_Auml, "A\"", 0x00c4},
al@18840 266 + { "Beta", 0, "B", 0x0392},
al@18840 267 + { "Ccedil", LATIN1_Ccedil, "C,", 0x00c7},
al@18840 268 + { "Chi", 0, "H", 0x03a7},
al@18840 269 + { "Dagger", 0, "++", 0x2021}, /* double dagger; U+2020 is "dagger" */
al@18840 270 + { "Delta", 0, "D", 0x0394},
al@18840 271 + { "ETH", LATIN1_ETH, "D-", 0x00d0},
al@18840 272 + { "Eacute", LATIN1_Eacute, "E'", 0x00c9},
al@18840 273 + { "Ecirc", LATIN1_Ecirc, "E^", 0x00ca},
al@18840 274 + { "Egrave", LATIN1_Egrave, "E`", 0x00c8},
al@18840 275 + { "Epsilon", 0, "E", 0x0395},
al@18840 276 + { "Eta", 0, "E", 0x0397},
al@18840 277 + { "Euml", LATIN1_Euml, "E\"", 0x00cb},
al@18840 278 + { "GT", 0, ">", 0x003e},
al@18840 279 + { "Gamma", 0, "G", 0x0393},
al@18840 280 + { "Iacute", LATIN1_Iacute, "I'", 0x00cd},
al@18840 281 + { "Icirc", LATIN1_Icirc, "I^", 0x00ce},
al@18840 282 + { "Igrave", LATIN1_Igrave, "I`", 0x00cc},
al@18840 283 + { "Iota", 0, "I", 0x0399},
al@18840 284 + { "Iuml", LATIN1_Iuml, "I\"", 0x00cf},
al@18840 285 + { "Kappa", 0, "K", 0x039a},
al@18840 286 + { "LT", 0, "<", 0x003c},
al@18840 287 + { "Lambda", 0, "L", 0x039b},
al@18840 288 + { "Mu", 0, "M", 0x039c},
al@18840 289 + { "Ntilde", LATIN1_Ntilde, "N~", 0x00d1},
al@18840 290 + { "Nu", 0, "N", 0x039d},
al@18840 291 + { "OElig", 0, "OE", 0x0152},
al@18840 292 + { "Oacute", LATIN1_Oacute, "O'", 0x00d3},
al@18840 293 + { "Ocirc", LATIN1_Ocirc, "O^", 0x00d4},
al@18840 294 + { "Ograve", LATIN1_Ograve, "O`", 0x00d2},
al@18840 295 + { "Omega", 0, "O", 0x03a9},
al@18840 296 + { "Omicron", 0, "O", 0x039f},
al@18840 297 + { "Oslash", LATIN1_Oslash, "O/", 0x00d8},
al@18840 298 + { "Otilde", LATIN1_Otilde, "O~", 0x00d5},
al@18840 299 + { "Ouml", LATIN1_Ouml, "O\"", 0x00d6},
al@18840 300 + { "Phi", 0, "F", 0x03a6},
al@18840 301 + { "Pi", 0, "P", 0x03a0},
al@18840 302 + { "Prime", 0, "''", 0x2033}, /* double prime */
al@18840 303 + { "Psi", 0, "PS", 0x03a8},
al@18840 304 + { "QUOT", 0, "\"", 0x0022},
al@18840 305 + { "Rho", 0, "R", 0x03a1},
al@18840 306 + { "Scaron", 0, "S", 0x0160}, /* capital S with caron; 0x0161 is "scaron" */
al@18840 307 + { "Sigma", 0, "S", 0x03a3},
al@18840 308 + { "THORN", LATIN1_THORN, "TH", 0x00de},
al@18840 309 + { "Tau", 0, "T", 0x03a4},
al@18840 310 + { "Theta", 0, "TH", 0x0398},
al@18840 311 + { "Uacute", LATIN1_Uacute, "U'", 0x00da},
al@18840 312 + { "Ucirc", LATIN1_Ucirc, "U^", 0x00db},
al@18840 313 + { "Ugrave", LATIN1_Ugrave, "U`", 0x00d9},
al@18840 314 + { "Upsilon", 0, "U", 0x03a5},
al@18840 315 + { "Uuml", LATIN1_Uuml, "U\"", 0x00dc},
al@18840 316 + { "Xi", 0, "X", 0x039e},
al@18840 317 + { "Yacute", LATIN1_Yacute, "Y'", 0x00dd},
al@18840 318 + { "Yuml", 0, "Y\"", 0x0178},
al@18840 319 + { "Zeta", 0, "Z", 0x0396},
al@18840 320 + { "aacute", LATIN1_aacute, "a'", 0x00e1},
al@18840 321 + { "acirc", LATIN1_acirc, "a^", 0x00e2},
al@18840 322 + { "acute", LATIN1_acute, "'", 0x00b4},
al@18840 323 + { "aelig", LATIN1_aelig, "ae", 0x00e6},
al@18840 324 + { "agrave", LATIN1_agrave, "a`", 0x00e0},
al@18840 325 { "alefsym", 0, "Aleph" },
al@18840 326 - { "alpha", 0, "a" },
al@18840 327 + { "alpha", 0, "a", 0x03b1},
al@18840 328 { "amp", 0, "&" },
al@18840 329 { "and", 0, "AND" },
al@18840 330 { "ang", 0, "-V" },
al@18840 331 { "apos", 0, "'" },
al@18840 332 - { "aring", LATIN1_aring, "aa" },
al@18840 333 - { "asymp", 0, "~=" },
al@18840 334 - { "atilde", LATIN1_atilde, "a~" },
al@18840 335 - { "auml", LATIN1_auml, "a\"" },
al@18840 336 + { "aring", LATIN1_aring, "aa", 0x00e5},
al@18840 337 + { "asymp", 0, "~=", 0x2248},
al@18840 338 + { "atilde", LATIN1_atilde, "a~", 0x00e3},
al@18840 339 + { "auml", LATIN1_auml, "a\"", 0x00e4}, /* 0x00e5 is "aring" */
al@18840 340 { "bdquo", 0, "\"" },
al@18840 341 - { "beta", 0, "b" },
al@18840 342 - { "brvbar", LATIN1_brvbar, "|" },
al@18840 343 - { "bull", 0, " o " },
al@18840 344 + { "beta", 0, "b", 0x03b2},
al@18840 345 + { "brvbar", LATIN1_brvbar, "|", 0x00a6},
al@18840 346 + { "bull", 0, " o ", 0x2022},
al@18840 347 { "cap", 0, "(U" },
al@18840 348 - { "ccedil", LATIN1_ccedil, "c," },
al@18840 349 - { "cedil", LATIN1_cedil, "," },
al@18840 350 - { "cent", LATIN1_cent, "-c-" },
al@18840 351 - { "chi", 0, "h" },
al@18840 352 - { "circ", 0, "^" },
al@18840 353 + { "ccedil", LATIN1_ccedil, "c,", 0x00e7},
al@18840 354 + { "cedil", LATIN1_cedil, ",", 0x00b8},
al@18840 355 + { "cent", LATIN1_cent, "-c-", 0x00a2},
al@18840 356 + { "chi", 0, "h", 0x03c7},
al@18840 357 + { "circ", 0, "^", 0x02c6}, /* modifier letter circumflex, as "tilde" uses U+02DC */
al@18840 358 // { "clubs", 0, "[clubs]" },
al@18840 359 { "cong", 0, "?=" },
al@18840 360 - { "copy", LATIN1_copy, "(c)" },
al@18840 361 + { "copy", LATIN1_copy, "(c)", 0x00a9},
al@18840 362 { "crarr", 0, "<-'" },
al@18840 363 { "cup", 0, ")U" },
al@18840 364 - { "curren", LATIN1_curren, "CUR" },
al@18840 365 + { "curren", LATIN1_curren, "CUR", 0x00a4},
al@18840 366 { "dArr", 0, "vv" },
al@18840 367 - { "dagger", 0, "+" },
al@18840 368 + { "dagger", 0, "+", 0x2020},
al@18840 369 { "darr", 0, "v" },
al@18840 370 - { "deg", LATIN1_deg, "DEG" },
al@18840 371 - { "delta", 0, "d" },
al@18840 372 + { "deg", LATIN1_deg, "DEG", 0x00b0},
al@18840 373 + { "delta", 0, "d", 0x03b4},
al@18840 374 // { "diams", 0, "[diamonds]" },
al@18840 375 - { "divide", LATIN1_divide, "/" },
al@18840 376 - { "eacute", LATIN1_eacute, "e'" },
al@18840 377 - { "ecirc", LATIN1_ecirc, "e^" },
al@18840 378 - { "egrave", LATIN1_egrave, "e`" },
al@18840 379 + { "divide", LATIN1_divide, "/", 0x00f7},
al@18840 380 + { "eacute", LATIN1_eacute, "e'", 0x00e9},
al@18840 381 + { "ecirc", LATIN1_ecirc, "e^", 0x00ea},
al@18840 382 + { "egrave", LATIN1_egrave, "e`", 0x00e8},
al@18840 383 { "empty", 0, "{}" },
al@18840 384 - { "epsilon", 0, "e" },
al@18840 385 - { "equiv", 0, "==" },
al@18840 386 - { "eta", 0, "e" },
al@18840 387 - { "eth", LATIN1_eth, "d-" },
al@18840 388 - { "euml", LATIN1_euml, "e\"" },
al@18840 389 - { "euro", 0, "EUR" },
al@18840 390 + { "epsilon", 0, "e", 0x03b5},
al@18840 391 + { "equiv", 0, "==", 0x2261},
al@18840 392 + { "eta", 0, "e", 0x03b7},
al@18840 393 + { "eth", LATIN1_eth, "d-", 0x00f0},
al@18840 394 + { "euml", LATIN1_euml, "e\"", 0x00eb},
al@18840 395 + { "euro", 0, "EUR", 0x20ac},
al@18840 396 { "exist", 0, "TE" },
al@18840 397 { "fnof", 0, "f" },
al@18840 398 { "forall", 0, "FA" },
al@18840 399 - { "frac12", LATIN1_frac12, " 1/2" },
al@18840 400 - { "frac14", LATIN1_frac14, " 1/4" },
al@18840 401 - { "frac34", LATIN1_frac34, " 3/4" },
al@18840 402 + { "frac12", LATIN1_frac12, " 1/2",0x00bd},
al@18840 403 + { "frac14", LATIN1_frac14, " 1/4",0x00bc},
al@18840 404 + { "frac34", LATIN1_frac34, " 3/4",0x00be},
al@18840 405 { "frasl", 0, "/" },
al@18840 406 - { "gamma", 0, "g" },
al@18840 407 - { "ge", 0, ">=" },
al@18840 408 - { "gt", 0, ">" },
al@18840 409 + { "gamma", 0, "g", 0x03b3},
al@18840 410 + { "ge", 0, ">=", 0x2265},
al@18840 411 + { "gt", 0, ">", 0x003e},
al@18840 412 { "hArr", 0, "<=>" },
al@18840 413 { "harr", 0, "<->" },
al@18840 414 // { "hearts", 0, "[hearts]" },
al@18840 415 - { "hellip", 0, "..." },
al@18840 416 - { "iacute", LATIN1_iacute, "i'" },
al@18840 417 - { "icirc", LATIN1_icirc, "i^" },
al@18840 418 - { "iexcl", LATIN1_iexcl, "!" },
al@18840 419 - { "igrave", LATIN1_igrave, "i`" },
al@18840 420 + { "hellip", 0, "...", 0x2026},
al@18840 421 + { "iacute", LATIN1_iacute, "i'", 0x00ed},
al@18840 422 + { "icirc", LATIN1_icirc, "i^", 0x00ee},
al@18840 423 + { "iexcl", LATIN1_iexcl, "!", 0x00a1},
al@18840 424 + { "igrave", LATIN1_igrave, "i`", 0x00ec},
al@18840 425 { "image", 0, "Im" },
al@18840 426 - { "infin", 0, "oo" },
al@18840 427 - { "int", 0, "INT" },
al@18840 428 - { "iota", 0, "i" },
al@18840 429 - { "iquest", LATIN1_iquest, "?" },
al@18840 430 + { "infin", 0, "oo", 0x221e},
al@18840 431 + { "int", 0, "INT", 0x222b},
al@18840 432 + { "iota", 0, "i", 0x03b9},
al@18840 433 + { "iquest", LATIN1_iquest, "?", 0x00bf},
al@18840 434 { "isin", 0, "(-" },
al@18840 435 - { "iuml", LATIN1_iuml, "i\"" },
al@18840 436 - { "kappa", 0, "k" },
al@18840 437 + { "iuml", LATIN1_iuml, "i\"", 0x00ef},
al@18840 438 + { "kappa", 0, "k", 0x03ba},
al@18840 439 { "lArr", 0, "<=" },
al@18840 440 - { "lambda", 0, "l" },
al@18840 441 + { "lambda", 0, "l", 0x03bb},
al@18840 442 { "lang", 0, "</" },
al@18840 443 { "laquo", LATIN1_laquo, "<<" },
al@18840 444 - { "larr", 0, "<-" },
al@18840 445 + { "larr", 0, "<-", 0x2190},
al@18840 446 // { "lceil", 0, "<|" },
al@18840 447 { "ldquo", 0, "\"" },
al@18840 448 - { "le", 0, "<=" },
al@18840 449 + { "le", 0, "<=", 0x2264},
al@18840 450 // { "lfloor", 0, "|<" },
al@18840 451 { "lowast", 0, "*" },
al@18840 452 { "loz", 0, "<>" },
al@18840 453 { "lsaquo", 0, "<" },
al@18840 454 { "lsquo", 0, "`" },
al@18840 455 - { "lt", 0, "<" },
al@18840 456 - { "macr", LATIN1_macr, "-" },
al@18840 457 + { "lt", 0, "<", 0x003c},
al@18840 458 + { "macr", LATIN1_macr, "-", 0x00af},
al@18840 459 { "mdash", 0, "--" },
al@18840 460 - { "micro", LATIN1_micro, "my" },
al@18840 461 - { "middot", LATIN1_middot, "." },
al@18840 462 - { "minus", 0, "-" },
al@18840 463 - { "mu", 0, "m" },
al@18840 464 + { "micro", LATIN1_micro, "my", 0x00b5},
al@18840 465 + { "middot", LATIN1_middot, ".", 0x00b7},
al@18840 466 + { "minus", 0, "-", 0x2212},
al@18840 467 + { "mu", 0, "m", 0x03bc},
al@18840 468 { "nabla", 0, "Nabla" },
al@18840 469 - { "nbsp", LATIN1_nbsp, " " },
al@18840 470 + { "nbsp", LATIN1_nbsp, " ", 0x00a0},
al@18840 471 { "ndash", 0, "-" },
al@18840 472 - { "ne", 0, "!=" },
al@18840 473 + { "ne", 0, "!=", 0x2260},
al@18840 474 { "ni", 0, "-)" },
al@18840 475 { "not", LATIN1_not, "NOT" },
al@18840 476 { "notin", 0, "!(-" },
al@18840 477 { "nsub", 0, "!(C" },
al@18840 478 - { "ntilde", LATIN1_ntilde, "n~" },
al@18840 479 - { "nu", 0, "n" },
al@18840 480 - { "oacute", LATIN1_oacute, "o'" },
al@18840 481 - { "ocirc", LATIN1_ocirc, "o^" },
al@18840 482 + { "ntilde", LATIN1_ntilde, "n~", 0x00f1},
al@18840 483 + { "nu", 0, "n", 0x03bd},
al@18840 484 + { "oacute", LATIN1_oacute, "o'", 0x00f3},
al@18840 485 + { "ocirc", LATIN1_ocirc, "o^", 0x00f4},
al@18840 486 { "oelig", 0, "oe" },
al@18840 487 - { "ograve", LATIN1_ograve, "o`" },
al@18840 488 + { "ograve", LATIN1_ograve, "o`", 0x00f2},
al@18840 489 { "oline", LATIN1_macr, "-" },
al@18840 490 - { "omega", 0, "o" },
al@18840 491 - { "omicron", 0, "o" },
al@18840 492 + { "omega", 0, "o", 0x03c9},
al@18840 493 + { "omicron", 0, "o", 0x03bf},
al@18840 494 { "oplus", 0, "(+)" },
al@18840 495 { "or", 0, "OR" },
al@18840 496 - { "ordf", LATIN1_ordf, "-a" },
al@18840 497 - { "ordm", LATIN1_ordm, "-o" },
al@18840 498 - { "oslash", LATIN1_oslash, "o/" },
al@18840 499 - { "otilde", LATIN1_otilde, "o~" },
al@18840 500 + { "ordf", LATIN1_ordf, "-a", 0x00aa},
al@18840 501 + { "ordm", LATIN1_ordm, "-o", 0x00ba},
al@18840 502 + { "oslash", LATIN1_oslash, "o/", 0x00f8},
al@18840 503 + { "otilde", LATIN1_otilde, "o~", 0x00f5},
al@18840 504 { "otimes", 0, "(x)" },
al@18840 505 - { "ouml", LATIN1_ouml, "o\"" },
al@18840 506 - { "para", LATIN1_para, "P:" },
al@18840 507 - { "part", 0, "PART" },
al@18840 508 - { "permil", 0, " 0/00" },
al@18840 509 + { "ouml", LATIN1_ouml, "o\"", 0x00f6},
al@18840 510 + { "para", LATIN1_para, "P:", 0x00b6},
al@18840 511 + { "part", 0, "PART",0x2202},
al@18840 512 + { "permil", 0, " 0/00",0x2030},
al@18840 513 { "perp", 0, "-T" },
al@18840 514 - { "phi", 0, "f" },
al@18840 515 - { "pi", 0, "p" },
al@18840 516 + { "phi", 0, "f", 0x03c6},
al@18840 517 + { "pi", 0, "p", 0x03c0},
al@18840 518 { "piv", 0, "Pi" },
al@18840 519 - { "plusmn", LATIN1_plusmn, "+/-" },
al@18840 520 - { "pound", LATIN1_pound, "-L-" },
al@18840 521 + { "plusmn", LATIN1_plusmn, "+/-", 0x00b1},
al@18840 522 + { "pound", LATIN1_pound, "-L-", 0x00a3},
al@18840 523 { "prime", 0, "'" },
al@18840 524 - { "prod", 0, "PROD" },
al@18840 525 + { "prod", 0, "PROD",0x220f},
al@18840 526 { "prop", 0, "0(" },
al@18840 527 - { "psi", 0, "ps" },
al@18840 528 + { "psi", 0, "ps", 0x03c8},
al@18840 529 { "quot", 0, "\"" },
al@18840 530 { "rArr", 0, "=>" },
al@18840 531 - { "radic", 0, "SQRT" },
al@18840 532 + { "radic", 0, "SQRT",0x221a},
al@18840 533 { "rang", 0, "/>" },
al@18840 534 { "raquo", LATIN1_raquo, ">>" },
al@18840 535 - { "rarr", 0, "->" },
al@18840 536 + { "rarr", 0, "->", 0x2192},
al@18840 537 // { "rceil", 0, ">|" },
al@18840 538 { "rdquo", 0, "\"" },
al@18840 539 { "real", 0, "Re" },
al@18840 540 - { "reg", LATIN1_reg, "(R)" },
al@18840 541 + { "reg", LATIN1_reg, "(R)", 0x00ae},
al@18840 542 // { "rfloor", 0, "|>" },
al@18840 543 - { "rho", 0, "r" },
al@18840 544 + { "rho", 0, "r", 0x03c1},
al@18840 545 { "rsaquo", 0, ">" },
al@18840 546 { "rsquo", 0, "'" },
al@18840 547 { "sbquo", 0, "'" },
al@18840 548 - { "scaron", 0, "s" },
al@18840 549 + { "scaron", 0, "s", 0x0161},
al@18840 550 { "sdot", 0, "DOT" },
al@18840 551 - { "sect", LATIN1_sect, "S:" },
al@18840 552 + { "sect", LATIN1_sect, "S:", 0x00a7},
al@18840 553 { "shy", LATIN1_shy, "" },
al@18840 554 - { "sigma", 0, "s" },
al@18840 555 - { "sigmaf", 0, "s" },
al@18840 556 + { "sigma", 0, "s", 0x03c3},
al@18840 557 + { "sigmaf", 0, "s", 0x03c2},
al@18840 558 { "sim", 0, "~" },
al@18840 559 // { "spades", 0, "[spades]" },
al@18840 560 { "sub", 0, "(C" },
al@18840 561 { "sube", 0, "(_" },
al@18840 562 - { "sum", 0, "SUM" },
al@18840 563 + { "sum", 0, "SUM", 0x2211},
al@18840 564 { "sup", 0, ")C" },
al@18840 565 - { "sup1", LATIN1_sup1, "^1" },
al@18840 566 - { "sup2", LATIN1_sup2, "^2" },
al@18840 567 - { "sup3", LATIN1_sup3, "^3" },
al@18840 568 + { "sup1", LATIN1_sup1, "^1", 0x00b9},
al@18840 569 + { "sup2", LATIN1_sup2, "^2", 0x00b2},
al@18840 570 + { "sup3", LATIN1_sup3, "^3", 0x00b3},
al@18840 571 { "supe", 0, ")_" },
al@18840 572 - { "szlig", LATIN1_szlig, "ss" },
al@18840 573 - { "tau", 0, "t" },
al@18840 574 + { "szlig", LATIN1_szlig, "ss", 0x00df},
al@18840 575 + { "tau", 0, "t", 0x03c4},
al@18840 576 { "there4", 0, ".:" },
al@18840 577 - { "theta", 0, "th" },
al@18840 578 - { "thorn", LATIN1_thorn, "th" },
al@18840 579 - { "tilde", 0, "~" },
al@18840 580 - { "times", LATIN1_times, "x" },
al@18840 581 - { "trade", 0, "[TM]" },
al@18840 582 + { "theta", 0, "th", 0x03b8},
al@18840 583 + { "thorn", LATIN1_thorn, "th", 0x00fe},
al@18840 584 + { "tilde", 0, "~", 0x02dc},
al@18840 585 + { "times", LATIN1_times, "x", 0x00d7},
al@18840 586 + { "trade", 0, "[TM]",0x2122},
al@18840 587 { "uArr", 0, "^^" },
al@18840 588 - { "uacute", LATIN1_uacute, "u'" },
al@18840 589 + { "uacute", LATIN1_uacute, "u'", 0x00fa},
al@18840 590 { "uarr", 0, "^" },
al@18840 591 - { "ucirc", LATIN1_ucirc, "u^" },
al@18840 592 - { "ugrave", LATIN1_ugrave, "u`" },
al@18840 593 - { "uml", LATIN1_uml, "\"" },
al@18840 594 - { "upsilon", 0, "u" },
al@18840 595 - { "uuml", LATIN1_uuml, "u\"" },
al@18840 596 + { "ucirc", LATIN1_ucirc, "u^", 0x00fb},
al@18840 597 + { "ugrave", LATIN1_ugrave, "u`", 0x00f9},
al@18840 598 + { "uml", LATIN1_uml, "\"", 0x00a8},
al@18840 599 + { "upsilon", 0, "u", 0x03c5},
al@18840 600 + { "uuml", LATIN1_uuml, "u\"", 0x00fc},
al@18840 601 { "weierp", 0, "P" },
al@18840 602 - { "xi", 0, "x" },
al@18840 603 - { "yacute", LATIN1_yacute, "y'" },
al@18840 604 - { "yen", LATIN1_yen, "YEN" },
al@18840 605 - { "yuml", LATIN1_yuml, "y\"" },
al@18840 606 - { "zeta", 0, "z" },
al@18840 607 + { "xi", 0, "x", 0x03be},
al@18840 608 + { "yacute", LATIN1_yacute, "y'", 0x00fd},
al@18840 609 + { "yen", LATIN1_yen, "YEN", 0x00a5},
al@18840 610 + { "yuml", LATIN1_yuml, "y\"", 0x00ff},
al@18840 611 + { "zeta", 0, "z", 0x03b6},
al@18840 612 };
al@18840 613
al@18840 614 -extern int use_iso8859;
al@18840 615 +extern int use_encoding;
al@18840 616
al@18840 617 /* ------------------------------------------------------------------------- */
al@18840 618
al@18840 619 +char ubuf[5]; /* up to four UTF-8 bytes plus the terminating NUL */
al@18840 620 +
al@18840 621 +char *mkutf(unsigned long x) /* UTF-8 encode code point x into the static ubuf */
al@18840 622 +{
al@18840 623 + memset(ubuf, 0, sizeof ubuf);
al@18840 624 + if(x > 0x10ffff || (x >= 0xd800 && x <= 0xdfff)) x = 0xfffd; /* not encodable: U+FFFD */
al@18840 625 + if(x < 128) ubuf[0] = x;
al@18840 626 + else if(x < 0x800) { ubuf[0] = (0xc0 | (x >> 6)); ubuf[1] = (0x80 | (x & 0x3f)); }
al@18840 627 + else if(x < 0x10000) {
al@18840 628 + ubuf[0] = (0xe0 | (x >> 12)); ubuf[1] = (0x80 | ((x >> 6) & 0x3f));
al@18840 629 + ubuf[2] = (0x80 | (x & 0x3f));
al@18840 630 + } else { /* U+10000..U+10FFFF need four bytes */
al@18840 631 + ubuf[0] = (0xf0 | (x >> 18)); ubuf[1] = (0x80 | ((x >> 12) & 0x3f));
al@18840 632 + ubuf[2] = (0x80 | ((x >> 6) & 0x3f)); ubuf[3] = (0x80 | (x & 0x3f));
al@18840 633 + }
al@18840 634 + return ubuf;
al@18840 635 +}
al@18840 636 +
al@18840 637 void
al@18840 638 replace_sgml_entities(string *s)
al@18840 639 {
al@18840 640 @@ -330,9 +349,9 @@
al@18840 641 */
al@18840 642 while (j < l && s->at(j) != '&') ++j;
al@18840 643 /*
al@18840 644 - * We could convert high-bit chars to "&#233;" here if use_iso8859
al@18840 645 - * is off, then let them be translated or not. Is the purpose of
al@18840 646 - * !use_iso8859 to allow SGML entities to be seen, or to strongly
al@18840 647 + * We could convert high-bit chars to "&#233;" here if USE_ASCII
al@18840 648 + * is on, then let them be translated or not. Is the purpose of
al@18840 649 + * USE_ASCII to allow SGML entities to be seen, or to strongly
al@18840 650 * filter against high-ASCII chars that might blow up a terminal
al@18840 651 * that doesn't speak ISO8859? For the moment, "allow SGML entities
al@18840 652 * to be seen" -- no filtering here.
al@18840 653 @@ -370,7 +389,11 @@
al@18840 654 if (!isdigit(c)) break;
al@18840 655 x = 10 * x + c - '0';
al@18840 656 }
al@18840 657 - if (use_iso8859 || (x < 128)) {
al@18840 658 + if (USE_UTF8) {
al@18840 659 + s->replace(beg, j - beg, mkutf(x));
al@18840 660 + j = beg + 1;
al@18840 661 + }
al@18840 662 + else if (USE_ISO8859 && (x < 256) || USE_ASCII && (x < 128)) {
al@18840 663 s->replace(beg, j - beg, 1, (char) x);
al@18840 664 j = beg + 1;
al@18840 665 } else {
al@18840 666 @@ -408,13 +431,17 @@
al@18840 667 (int (*)(const void *, const void *)) strcmp
al@18840 668 );
al@18840 669 if (entity != NULL) {
al@18840 670 - if (use_iso8859 && entity->iso8859code) {
al@18840 671 + if (USE_ISO8859 && entity->iso8859code) {
al@18840 672 s->replace(beg, j - beg, 1, (char) entity->iso8859code);
al@18840 673 j = beg + 1;
al@18840 674 - } else if (entity->asciistr) {
al@18840 675 + } else if (USE_UTF8 && entity->unicode) {
al@18840 676 + s->replace(beg, j - beg, mkutf(entity->unicode));
al@18840 677 + j = beg + 1;
al@18840 678 + } else if (entity->asciistr) {
al@18840 679 + /* Every encoding falls back to the ASCII rendering (e.g. "&amp;"). */
al@18840 680 s->replace(beg, j - beg, entity->asciistr);
al@18840 681 j = beg + 1;
al@18840 682 } /* else don't replace it at all, we don't have a translation */
al@18840 683 }
al@18840 684 } else {
al@18840 685 ; /* EXTENSION: Allow literal '&' sometimes. */
al@18840 686 diff -r -u -bB html2text-1.3.2a/table.C html2text-1.3.2a-patched/table.C
al@18840 687 --- html2text-1.3.2a/table.C 2002-07-22 13:32:50.000000000 +0200
al@18840 688 +++ html2text-1.3.2a-patched/table.C 2005-05-13 22:19:59.871136320 +0200
al@18840 689 @@ -175,7 +175,7 @@
al@18840 690 - (*number_of_columns_return - 1) * (column_spacing + 0),
al@18840 691 Area::LEFT // Yields better results than "p->halign"!
al@18840 692 ));
al@18840 693 - p->width = tmp.get() ? tmp->width() : 0;
al@18840 694 + p->width = tmp.get() ? tmp->utf_width() : 0;
al@18840 695 }
al@18840 696 p->minimized = false;
al@18840 697
al@18840 698 @@ -308,7 +308,7 @@
al@18840 699 left_of_column + old_column_width - 1,
al@18840 700 Area::LEFT // Yields better results than "lc.halign"!
al@18840 701 ));
al@18840 702 - w = tmp->width();
al@18840 703 + w = tmp->utf_width();
al@18840 704 if (w >= left_of_column + old_column_width) lc.minimized = true;
al@18840 705 }
al@18840 706 if (w > left_of_column + new_column_width) {