wok-current diff html2text/stuff/patch-utf8-html2text-1.3.2a.diff @ rev 19387
dialog: update dialogrc
author | Pascal Bellard <pascal.bellard@slitaz.org> |
---|---|
date | Sun Aug 21 12:51:53 2016 +0200 (2016-08-21) |
parents | |
children |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/html2text/stuff/patch-utf8-html2text-1.3.2a.diff Sun Aug 21 12:51:53 2016 +0200 1.3 @@ -0,0 +1,706 @@ 1.4 +diff -r -u -bB html2text-1.3.2a/Area.C html2text-1.3.2a-patched/Area.C 1.5 +--- html2text-1.3.2a/Area.C 2003-11-23 12:05:29.000000000 +0100 1.6 ++++ html2text-1.3.2a-patched/Area.C 2005-05-13 22:19:59.862137688 +0200 1.7 +@@ -36,10 +36,13 @@ 1.8 + #include <iostream> 1.9 + 1.10 + #include "Area.h" 1.11 ++#include "html.h" 1.12 + #include "string.h" 1.13 + 1.14 + #define LATIN1_nbsp 160 1.15 + 1.16 ++extern int use_encoding; 1.17 ++ 1.18 + /* ------------------------------------------------------------------------- */ 1.19 + 1.20 + #define malloc_array(type, size)\ 1.21 +@@ -81,6 +84,27 @@ 1.22 + 1.23 + /* ------------------------------------------------------------------------- */ 1.24 + 1.25 ++/* utf_length() and utf_width() 1.26 ++ * 1.27 ++ * Very simplified algorithm of calculating length of UTF-8 1.28 ++ * string. No check for errors. Counting only ASCII bytes and 1.29 ++ * leading bytes of UTF-8 multibyte sequences. All bytes like 1.30 ++ * 10xxxxxx are dropped. If USE_UTF8 is false then returns 1.31 ++ * usual length. --YS 1.32 ++ */ 1.33 ++ 1.34 ++unsigned int 1.35 ++Line::utf_length(size_type f, size_type t) const 1.36 ++{ 1.37 ++ size_type m = (t < length_ ? 
t : length_); 1.38 ++ size_type r = m - f; 1.39 ++ if(USE_UTF8) { 1.40 ++ for (int i = f; i < m; i++) 1.41 ++ if((cells_[i].character & 0xc0) == 0x80) r--; 1.42 ++ } 1.43 ++ return r; 1.44 ++} 1.45 ++ 1.46 + void 1.47 + Line::resize(size_type l) 1.48 + { 1.49 +@@ -236,6 +260,23 @@ 1.50 + return *this; 1.51 + } 1.52 + 1.53 ++unsigned int 1.54 ++Area::utf_width() 1.55 ++{ 1.56 ++ size_type r = width_; 1.57 ++ if(USE_UTF8) { r = 0; 1.58 ++ for (size_type yy = 0; yy < height_; yy++) { 1.59 ++ size_type r1 = 0; 1.60 ++ for (int i = width_ - 1; i >= 0; i--) { 1.61 ++ if(!r1 && isspace(cells_[yy][i].character)) continue; 1.62 ++ if((cells_[yy][i].character & 0xc0) != 0x80) r1++; 1.63 ++ } 1.64 ++ if(r < r1) r = r1; 1.65 ++ } 1.66 ++ } 1.67 ++ return r; 1.68 ++} 1.69 ++ 1.70 + void 1.71 + Area::resize(size_type w, size_type h) 1.72 + { 1.73 +@@ -439,7 +480,7 @@ 1.74 + char c = p->character; 1.75 + char a = p->attribute; 1.76 + 1.77 +- if (c == (char) LATIN1_nbsp) c = ' '; 1.78 ++ if (c == (char) LATIN1_nbsp && !USE_UTF8) c = ' '; 1.79 + 1.80 + if (a == Cell::NONE) { 1.81 + os << c; 1.82 +Nur in html2text-1.3.2a-patched/: Area.C.orig. 
1.83 +diff -r -u -bB html2text-1.3.2a/Area.h html2text-1.3.2a-patched/Area.h 1.84 +--- html2text-1.3.2a/Area.h 2003-11-23 12:05:29.000000000 +0100 1.85 ++++ html2text-1.3.2a-patched/Area.h 2005-05-13 22:19:59.863137536 +0200 1.86 +@@ -81,6 +81,8 @@ 1.87 + Cell &operator[](size_type x) { return cells_[x]; } 1.88 + const Cell *cells() const { return cells_; } 1.89 + 1.90 ++ unsigned int utf_length(size_type f, size_type t) const; 1.91 ++ 1.92 + void resize(size_type l); 1.93 + void enlarge(size_type l) { if (l > length_) resize(l); } 1.94 + 1.95 +@@ -134,6 +136,8 @@ 1.96 + Cell *operator[](size_type y) { return cells_[y]; } 1.97 + const Area &operator>>=(size_type rs); 1.98 + 1.99 ++ unsigned int utf_width(); 1.100 ++ 1.101 + void resize(size_type w, size_type h); 1.102 + void enlarge(size_type w, size_type h); 1.103 + 1.104 +Nur in html2text-1.3.2a-patched/: Area.h.orig. 1.105 +diff -r -u -bB html2text-1.3.2a/format.C html2text-1.3.2a-patched/format.C 1.106 +--- html2text-1.3.2a/format.C 2003-11-23 12:05:29.000000000 +0100 1.107 ++++ html2text-1.3.2a-patched/format.C 2005-05-13 22:19:59.865137232 +0200 1.108 +@@ -1210,6 +1210,7 @@ 1.109 + } 1.110 + 1.111 + Line::size_type to = from + 1; 1.112 ++ int to_from; 1.113 + 1.114 + Line::size_type lbp = (Line::size_type) -1; // "Last break position". 1.115 + 1.116 +@@ -1238,18 +1239,20 @@ 1.117 + to++; 1.118 + } 1.119 + 1.120 +- if (to - from > w && lbp != (Area::size_type) -1) { to = lbp; break; } 1.121 ++ if (line.utf_length(from,to) > w && lbp != (Area::size_type) -1) 1.122 ++ { to = lbp; break; } 1.123 + } 1.124 + 1.125 ++ to_from = line.utf_length(from,to); 1.126 + /* 1.127 + * Copy the "from...to" range from the "line" to the bottom of the "res" 1.128 + * Area. 
1.129 + */ 1.130 + Area::size_type x = 0; 1.131 + Area::size_type len = to - from; 1.132 +- if (halign == Area::LEFT || len >= w) { ; } else 1.133 +- if (halign == Area::CENTER) { x += (w - len) / 2; } else 1.134 +- if (halign == Area::RIGHT) { x += w - len; } 1.135 ++ if (halign == Area::LEFT || to_from >= w) { ; } else 1.136 ++ if (halign == Area::CENTER) { x += (w - to_from) / 2; } else 1.137 ++ if (halign == Area::RIGHT) { x += w - to_from; } 1.138 + res->insert(line.cells() + from, len, x, res->height()); 1.139 + 1.140 + /* 1.141 +Nur in html2text-1.3.2a-patched/: format.C.orig. 1.142 +diff -r -u -bB html2text-1.3.2a/html2text.C html2text-1.3.2a-patched/html2text.C 1.143 +--- html2text-1.3.2a/html2text.C 2003-11-23 12:05:29.000000000 +0100 1.144 ++++ html2text-1.3.2a-patched/html2text.C 2005-05-13 22:19:59.868136776 +0200 1.145 +@@ -148,9 +148,10 @@ 1.146 + -o <file> Redirect output into <file>\n\ 1.147 + -nobs Do not use backspaces for boldface and underlining\n\ 1.148 + -ascii Use plain ASCII for output instead of ISO-8859-1\n\ 1.149 ++ -utf8 Assume both terminal and input stream are in UTF-8 mode\n\ 1.150 + "; 1.151 + 1.152 +-int use_iso8859 = 1; 1.153 ++int use_encoding = ISO8859; 1.154 + 1.155 + int 1.156 + main(int argc, char **argv) 1.157 +@@ -199,7 +200,8 @@ 1.158 + if (!strcmp(arg, "-width" )) { width = atoi(argv[++i]); } else 1.159 + if (!strcmp(arg, "-o" )) { output_file_name = argv[++i]; } else 1.160 + if (!strcmp(arg, "-nobs" )) { use_backspaces = false; } else 1.161 +- if (!strcmp(arg, "-ascii" )) { use_iso8859 = false; } else 1.162 ++ if (!strcmp(arg, "-ascii" )) { use_encoding = ASCII; } else 1.163 ++ if (!strcmp(arg, "-utf8" )) { use_encoding = UTF8; } else 1.164 + { 1.165 + std::cerr 1.166 + << "Unrecognized command line option \"" 1.167 +Nur in html2text-1.3.2a-patched/: html2text.C.orig. 
1.168 +diff -r -u -bB html2text-1.3.2a/html.h html2text-1.3.2a-patched/html.h 1.169 +--- html2text-1.3.2a/html.h 2001-10-04 22:03:54.000000000 +0200 1.170 ++++ html2text-1.3.2a-patched/html.h 2005-05-13 22:19:59.866137080 +0200 1.171 +@@ -61,6 +61,11 @@ 1.172 + 1.173 + /* ------------------------------------------------------------------------- */ 1.174 + 1.175 ++enum {ASCII, ISO8859, UTF8}; 1.176 ++#define USE_ISO8859 (use_encoding == ISO8859) 1.177 ++#define USE_ASCII (use_encoding == ASCII) 1.178 ++#define USE_UTF8 (use_encoding == UTF8) 1.179 ++ 1.180 + #define LATIN1_nbsp 160 1.181 + #define LATIN1_iexcl 161 1.182 + #define LATIN1_cent 162 1.183 +diff -r -u -bB html2text-1.3.2a/sgml.C html2text-1.3.2a-patched/sgml.C 1.184 +--- html2text-1.3.2a/sgml.C 2003-11-23 12:09:11.000000000 +0100 1.185 ++++ html2text-1.3.2a-patched/sgml.C 2005-05-13 22:19:59.870136472 +0200 1.186 +@@ -62,261 +62,280 @@ 1.187 + char name[8]; 1.188 + int iso8859code; 1.189 + char *asciistr; 1.190 ++ unsigned long unicode; 1.191 + } entities[] = { 1.192 +- { "AElig", LATIN1_AElig, "AE" }, 1.193 +- { "AMP", 0, "&" }, 1.194 +- { "Aacute", LATIN1_Aacute, "A'" }, 1.195 +- { "Acirc", LATIN1_Acirc, "A^" }, 1.196 +- { "Agrave", LATIN1_Agrave, "A`" }, 1.197 +- { "Alpha", 0, "A" }, 1.198 +- { "Aring", LATIN1_Aring, "AA" }, 1.199 +- { "Atilde", LATIN1_Atilde, "A~" }, 1.200 +- { "Auml", LATIN1_Auml, "A\"" }, 1.201 +- { "Beta", 0, "B" }, 1.202 +- { "Ccedil", LATIN1_Ccedil, "C," }, 1.203 +- { "Chi", 0, "H" }, 1.204 +- { "Dagger", 0, "++" }, 1.205 +- { "Delta", 0, "D" }, 1.206 +- { "ETH", LATIN1_ETH, "D-" }, 1.207 +- { "Eacute", LATIN1_Eacute, "E'" }, 1.208 +- { "Ecirc", LATIN1_Ecirc, "E^" }, 1.209 +- { "Egrave", LATIN1_Egrave, "E`" }, 1.210 +- { "Epsilon", 0, "E" }, 1.211 +- { "Eta", 0, "E" }, 1.212 +- { "Euml", LATIN1_Euml, "E\"" }, 1.213 +- { "GT", 0, ">" }, 1.214 +- { "Gamma", 0, "G" }, 1.215 +- { "Iacute", LATIN1_Iacute, "I'" }, 1.216 +- { "Icirc", LATIN1_Icirc, "I^" }, 1.217 +- { "Igrave", 
LATIN1_Igrave, "I`" }, 1.218 +- { "Iota", 0, "I" }, 1.219 +- { "Iuml", LATIN1_Iuml, "I\"" }, 1.220 +- { "Kappa", 0, "K" }, 1.221 +- { "LT", 0, "<" }, 1.222 +- { "Lambda", 0, "L" }, 1.223 +- { "Mu", 0, "M" }, 1.224 +- { "Ntilde", LATIN1_Ntilde, "N~" }, 1.225 +- { "Nu", 0, "N" }, 1.226 +- { "OElig", 0, "OE" }, 1.227 +- { "Oacute", LATIN1_Oacute, "O'" }, 1.228 +- { "Ocirc", LATIN1_Ocirc, "O^" }, 1.229 +- { "Ograve", LATIN1_Ograve, "O`" }, 1.230 +- { "Omega", 0, "O" }, 1.231 +- { "Omicron", 0, "O" }, 1.232 +- { "Oslash", LATIN1_Oslash, "O/" }, 1.233 +- { "Otilde", LATIN1_Otilde, "O~" }, 1.234 +- { "Ouml", LATIN1_Ouml, "O\"" }, 1.235 +- { "Phi", 0, "F" }, 1.236 +- { "Pi", 0, "P" }, 1.237 +- { "Prime", 0, "''" }, 1.238 +- { "Psi", 0, "PS" }, 1.239 +- { "QUOT", 0, "\"" }, 1.240 +- { "Rho", 0, "R" }, 1.241 +- { "Scaron", 0, "S" }, 1.242 +- { "Sigma", 0, "S" }, 1.243 +- { "THORN", LATIN1_THORN, "TH" }, 1.244 +- { "Tau", 0, "T" }, 1.245 +- { "Theta", 0, "TH" }, 1.246 +- { "Uacute", LATIN1_Uacute, "U'" }, 1.247 +- { "Ucirc", LATIN1_Ucirc, "U^" }, 1.248 +- { "Ugrave", LATIN1_Ugrave, "U`" }, 1.249 +- { "Upsilon", 0, "U" }, 1.250 +- { "Uuml", LATIN1_Uuml, "U\"" }, 1.251 +- { "Xi", 0, "X" }, 1.252 +- { "Yacute", LATIN1_Yacute, "Y'" }, 1.253 +- { "Yuml", 0, "Y\"" }, 1.254 +- { "Zeta", 0, "Z" }, 1.255 +- { "aacute", LATIN1_aacute, "a'" }, 1.256 +- { "acirc", LATIN1_acirc, "a^" }, 1.257 +- { "acute", LATIN1_acute, "'" }, 1.258 +- { "aelig", LATIN1_aelig, "ae" }, 1.259 +- { "agrave", LATIN1_agrave, "a`" }, 1.260 ++ { "AElig", LATIN1_AElig, "AE", 0x00c6}, 1.261 ++ { "AMP", 0, "&", 0x0026}, 1.262 ++ { "Aacute", LATIN1_Aacute, "A'", 0x00c1}, 1.263 ++ { "Acirc", LATIN1_Acirc, "A^", 0x00c2}, 1.264 ++ { "Agrave", LATIN1_Agrave, "A`", 0x00c0}, 1.265 ++ { "Alpha", 0, "A", 0x0391}, 1.266 ++ { "Aring", LATIN1_Aring, "AA", 0x00c5}, 1.267 ++ { "Atilde", LATIN1_Atilde, "A~", 0x00c3}, 1.268 ++ { "Auml", LATIN1_Auml, "A\"", 0x00c4}, 1.269 ++ { "Beta", 0, "B", 0x0392}, 1.270 ++ { "Ccedil", 
LATIN1_Ccedil, "C,", 0x00c7}, 1.271 ++ { "Chi", 0, "H", 0x03a7}, 1.272 ++ { "Dagger", 0, "++", 0x2021}, 1.273 ++ { "Delta", 0, "D", 0x0394}, 1.274 ++ { "ETH", LATIN1_ETH, "D-", 0x00d0}, 1.275 ++ { "Eacute", LATIN1_Eacute, "E'", 0x00c9}, 1.276 ++ { "Ecirc", LATIN1_Ecirc, "E^", 0x00ca}, 1.277 ++ { "Egrave", LATIN1_Egrave, "E`", 0x00c8}, 1.278 ++ { "Epsilon", 0, "E", 0x0395}, 1.279 ++ { "Eta", 0, "E", 0x0397}, 1.280 ++ { "Euml", LATIN1_Euml, "E\"", 0x00cb}, 1.281 ++ { "GT", 0, ">", 0x003e}, 1.282 ++ { "Gamma", 0, "G", 0x0393}, 1.283 ++ { "Iacute", LATIN1_Iacute, "I'", 0x00cd}, 1.284 ++ { "Icirc", LATIN1_Icirc, "I^", 0x00ce}, 1.285 ++ { "Igrave", LATIN1_Igrave, "I`", 0x00cc}, 1.286 ++ { "Iota", 0, "I", 0x0399}, 1.287 ++ { "Iuml", LATIN1_Iuml, "I\"", 0x00cf}, 1.288 ++ { "Kappa", 0, "K", 0x039a}, 1.289 ++ { "LT", 0, "<", 0x003c}, 1.290 ++ { "Lambda", 0, "L", 0x039b}, 1.291 ++ { "Mu", 0, "M", 0x039c}, 1.292 ++ { "Ntilde", LATIN1_Ntilde, "N~", 0x00d1}, 1.293 ++ { "Nu", 0, "N", 0x039d}, 1.294 ++ { "OElig", 0, "OE", 0x0152}, 1.295 ++ { "Oacute", LATIN1_Oacute, "O'", 0x00d3}, 1.296 ++ { "Ocirc", LATIN1_Ocirc, "O^", 0x00d4}, 1.297 ++ { "Ograve", LATIN1_Ograve, "O`", 0x00d2}, 1.298 ++ { "Omega", 0, "O", 0x03a9}, 1.299 ++ { "Omicron", 0, "O", 0x039f}, 1.300 ++ { "Oslash", LATIN1_Oslash, "O/", 0x00d8}, 1.301 ++ { "Otilde", LATIN1_Otilde, "O~", 0x00d5}, 1.302 ++ { "Ouml", LATIN1_Ouml, "O\"", 0x00d6}, 1.303 ++ { "Phi", 0, "F", 0x03a6}, 1.304 ++ { "Pi", 0, "P", 0x03a0}, 1.305 ++ { "Prime", 0, "''", }, 1.306 ++ { "Psi", 0, "PS", 0x03a8}, 1.307 ++ { "QUOT", 0, "\"", }, 1.308 ++ { "Rho", 0, "R", 0x03a1}, 1.309 ++ { "Scaron", 0, "S", 0x0160}, 1.310 ++ { "Sigma", 0, "S", 0x03a3}, 1.311 ++ { "THORN", LATIN1_THORN, "TH", 0x00de}, 1.312 ++ { "Tau", 0, "T", 0x03a4}, 1.313 ++ { "Theta", 0, "TH", 0x0398}, 1.314 ++ { "Uacute", LATIN1_Uacute, "U'", 0x00da}, 1.315 ++ { "Ucirc", LATIN1_Ucirc, "U^", 0x00db}, 1.316 ++ { "Ugrave", LATIN1_Ugrave, "U`", 0x00d9}, 1.317 ++ { "Upsilon", 0, "U", 0x03a5},
1.318 ++ { "Uuml", LATIN1_Uuml, "U\"", 0x00dc}, 1.319 ++ { "Xi", 0, "X", 0x039e}, 1.320 ++ { "Yacute", LATIN1_Yacute, "Y'", 0x00dd}, 1.321 ++ { "Yuml", 0, "Y\"", 0x0178}, 1.322 ++ { "Zeta", 0, "Z", 0x0396}, 1.323 ++ { "aacute", LATIN1_aacute, "a'", 0x00e1}, 1.324 ++ { "acirc", LATIN1_acirc, "a^", 0x00e2}, 1.325 ++ { "acute", LATIN1_acute, "'", 0x00b4}, 1.326 ++ { "aelig", LATIN1_aelig, "ae", 0x00e6}, 1.327 ++ { "agrave", LATIN1_agrave, "a`", 0x00e0}, 1.328 + { "alefsym", 0, "Aleph" }, 1.329 +- { "alpha", 0, "a" }, 1.330 ++ { "alpha", 0, "a", 0x03b1}, 1.331 + { "amp", 0, "&" }, 1.332 + { "and", 0, "AND" }, 1.333 + { "ang", 0, "-V" }, 1.334 + { "apos", 0, "'" }, 1.335 +- { "aring", LATIN1_aring, "aa" }, 1.336 +- { "asymp", 0, "~=" }, 1.337 +- { "atilde", LATIN1_atilde, "a~" }, 1.338 +- { "auml", LATIN1_auml, "a\"" }, 1.339 ++ { "aring", LATIN1_aring, "aa", 0x00e5}, 1.340 ++ { "asymp", 0, "~=", 0x2248}, 1.341 ++ { "atilde", LATIN1_atilde, "a~", 0x00e3}, 1.342 ++ { "auml", LATIN1_auml, "a\"", 0x00e4}, 1.343 + { "bdquo", 0, "\"" }, 1.344 +- { "beta", 0, "b" }, 1.345 +- { "brvbar", LATIN1_brvbar, "|" }, 1.346 +- { "bull", 0, " o " }, 1.347 ++ { "beta", 0, "b", 0x03b2}, 1.348 ++ { "brvbar", LATIN1_brvbar, "|", 0x00a6}, 1.349 ++ { "bull", 0, " o ", 0x2022}, 1.350 + { "cap", 0, "(U" }, 1.351 +- { "ccedil", LATIN1_ccedil, "c," }, 1.352 +- { "cedil", LATIN1_cedil, "," }, 1.353 +- { "cent", LATIN1_cent, "-c-" }, 1.354 +- { "chi", 0, "h" }, 1.355 +- { "circ", 0, "^" }, 1.356 ++ { "ccedil", LATIN1_ccedil, "c,", 0x00e7}, 1.357 ++ { "cedil", LATIN1_cedil, ",", 0x00b8}, 1.358 ++ { "cent", LATIN1_cent, "-c-", 0x00a2}, 1.359 ++ { "chi", 0, "h", 0x03c7}, 1.360 ++ { "circ", 0, "^", 0x005e}, 1.361 + // { "clubs", 0, "[clubs]" }, 1.362 + { "cong", 0, "?=" }, 1.363 +- { "copy", LATIN1_copy, "(c)" }, 1.364 ++ { "copy", LATIN1_copy, "(c)", 0x00a9}, 1.365 + { "crarr", 0, "<-'" }, 1.366 + { "cup", 0, ")U" }, 1.367 +- { "curren", LATIN1_curren, "CUR" }, 1.368 ++ { "curren", LATIN1_curren,
"CUR", 0x00a4}, 1.369 + { "dArr", 0, "vv" }, 1.370 +- { "dagger", 0, "+" }, 1.371 ++ { "dagger", 0, "+", 0x2020}, 1.372 + { "darr", 0, "v" }, 1.373 +- { "deg", LATIN1_deg, "DEG" }, 1.374 +- { "delta", 0, "d" }, 1.375 ++ { "deg", LATIN1_deg, "DEG", 0x00b0}, 1.376 ++ { "delta", 0, "d", 0x03b4}, 1.377 + // { "diams", 0, "[diamonds]" }, 1.378 +- { "divide", LATIN1_divide, "/" }, 1.379 +- { "eacute", LATIN1_eacute, "e'" }, 1.380 +- { "ecirc", LATIN1_ecirc, "e^" }, 1.381 +- { "egrave", LATIN1_egrave, "e`" }, 1.382 ++ { "divide", LATIN1_divide, "/", 0x00f7}, 1.383 ++ { "eacute", LATIN1_eacute, "e'", 0x00e9}, 1.384 ++ { "ecirc", LATIN1_ecirc, "e^", 0x00ea}, 1.385 ++ { "egrave", LATIN1_egrave, "e`", 0x00e8}, 1.386 + { "empty", 0, "{}" }, 1.387 +- { "epsilon", 0, "e" }, 1.388 +- { "equiv", 0, "==" }, 1.389 +- { "eta", 0, "e" }, 1.390 +- { "eth", LATIN1_eth, "d-" }, 1.391 +- { "euml", LATIN1_euml, "e\"" }, 1.392 +- { "euro", 0, "EUR" }, 1.393 ++ { "epsilon", 0, "e", 0x03b5}, 1.394 ++ { "equiv", 0, "==", 0x2261}, 1.395 ++ { "eta", 0, "e", 0x03b7}, 1.396 ++ { "eth", LATIN1_eth, "d-", 0x00f0}, 1.397 ++ { "euml", LATIN1_euml, "e\"", 0x00eb}, 1.398 ++ { "euro", 0, "EUR", 0x20ac}, 1.399 + { "exist", 0, "TE" }, 1.400 + { "fnof", 0, "f" }, 1.401 + { "forall", 0, "FA" }, 1.402 +- { "frac12", LATIN1_frac12, " 1/2" }, 1.403 +- { "frac14", LATIN1_frac14, " 1/4" }, 1.404 +- { "frac34", LATIN1_frac34, " 3/4" }, 1.405 ++ { "frac12", LATIN1_frac12, " 1/2",0x00bd}, 1.406 ++ { "frac14", LATIN1_frac14, " 1/4",0x00bc}, 1.407 ++ { "frac34", LATIN1_frac34, " 3/4",0x00be}, 1.408 + { "frasl", 0, "/" }, 1.409 +- { "gamma", 0, "g" }, 1.410 +- { "ge", 0, ">=" }, 1.411 +- { "gt", 0, ">" }, 1.412 ++ { "gamma", 0, "g", 0x03b3}, 1.413 ++ { "ge", 0, ">=", 0x2265}, 1.414 ++ { "gt", 0, ">", 0x003e}, 1.415 + { "hArr", 0, "<=>" }, 1.416 + { "harr", 0, "<->" }, 1.417 + // { "hearts", 0, "[hearts]" }, 1.418 +- { "hellip", 0, "..." 
}, 1.419 +- { "iacute", LATIN1_iacute, "i'" }, 1.420 +- { "icirc", LATIN1_icirc, "i^" }, 1.421 +- { "iexcl", LATIN1_iexcl, "!" }, 1.422 +- { "igrave", LATIN1_igrave, "i`" }, 1.423 ++ { "hellip", 0, "...", 0x2026}, 1.424 ++ { "iacute", LATIN1_iacute, "i'", 0x00ed}, 1.425 ++ { "icirc", LATIN1_icirc, "i^", 0x00ee}, 1.426 ++ { "iexcl", LATIN1_iexcl, "!", 0x00a1}, 1.427 ++ { "igrave", LATIN1_igrave, "i`", 0x00ec}, 1.428 + { "image", 0, "Im" }, 1.429 +- { "infin", 0, "oo" }, 1.430 +- { "int", 0, "INT" }, 1.431 +- { "iota", 0, "i" }, 1.432 +- { "iquest", LATIN1_iquest, "?" }, 1.433 ++ { "infin", 0, "oo", 0x221e}, 1.434 ++ { "int", 0, "INT", 0x222b}, 1.435 ++ { "iota", 0, "i", 0x03b9}, 1.436 ++ { "iquest", LATIN1_iquest, "?", 0x00bf}, 1.437 + { "isin", 0, "(-" }, 1.438 +- { "iuml", LATIN1_iuml, "i\"" }, 1.439 +- { "kappa", 0, "k" }, 1.440 ++ { "iuml", LATIN1_iuml, "i\"", 0x00ef}, 1.441 ++ { "kappa", 0, "k", 0x03ba}, 1.442 + { "lArr", 0, "<=" }, 1.443 +- { "lambda", 0, "l" }, 1.444 ++ { "lambda", 0, "l", 0x03bb}, 1.445 + { "lang", 0, "</" }, 1.446 + { "laquo", LATIN1_laquo, "<<" }, 1.447 +- { "larr", 0, "<-" }, 1.448 ++ { "larr", 0, "<-", 0x2190}, 1.449 + // { "lceil", 0, "<|" }, 1.450 + { "ldquo", 0, "\"" }, 1.451 +- { "le", 0, "<=" }, 1.452 ++ { "le", 0, "<=", 0x2264}, 1.453 + // { "lfloor", 0, "|<" }, 1.454 + { "lowast", 0, "*" }, 1.455 + { "loz", 0, "<>" }, 1.456 + { "lsaquo", 0, "<" }, 1.457 + { "lsquo", 0, "`" }, 1.458 +- { "lt", 0, "<" }, 1.459 +- { "macr", LATIN1_macr, "-" }, 1.460 ++ { "lt", 0, "<", 0x003c}, 1.461 ++ { "macr", LATIN1_macr, "-", 0x00af}, 1.462 + { "mdash", 0, "--" }, 1.463 +- { "micro", LATIN1_micro, "my" }, 1.464 +- { "middot", LATIN1_middot, "." 
}, 1.465 +- { "minus", 0, "-" }, 1.466 +- { "mu", 0, "m" }, 1.467 ++ { "micro", LATIN1_micro, "my", 0x00b5}, 1.468 ++ { "middot", LATIN1_middot, ".", 0x00b7}, 1.469 ++ { "minus", 0, "-", 0x2212}, 1.470 ++ { "mu", 0, "m", 0x03bc}, 1.471 + { "nabla", 0, "Nabla" }, 1.472 +- { "nbsp", LATIN1_nbsp, " " }, 1.473 ++ { "nbsp", LATIN1_nbsp, " ", 0x00a0}, 1.474 + { "ndash", 0, "-" }, 1.475 +- { "ne", 0, "!=" }, 1.476 ++ { "ne", 0, "!=", 0x2260}, 1.477 + { "ni", 0, "-)" }, 1.478 + { "not", LATIN1_not, "NOT" }, 1.479 + { "notin", 0, "!(-" }, 1.480 + { "nsub", 0, "!(C" }, 1.481 +- { "ntilde", LATIN1_ntilde, "n~" }, 1.482 +- { "nu", 0, "n" }, 1.483 +- { "oacute", LATIN1_oacute, "o'" }, 1.484 +- { "ocirc", LATIN1_ocirc, "o^" }, 1.485 ++ { "ntilde", LATIN1_ntilde, "n~", 0x00f1}, 1.486 ++ { "nu", 0, "n", 0x03bd}, 1.487 ++ { "oacute", LATIN1_oacute, "o'", 0x00f3}, 1.488 ++ { "ocirc", LATIN1_ocirc, "o^", 0x00f4}, 1.489 + { "oelig", 0, "oe" }, 1.490 +- { "ograve", LATIN1_ograve, "o`" }, 1.491 ++ { "ograve", LATIN1_ograve, "o`", 0x00f2}, 1.492 + { "oline", LATIN1_macr, "-" }, 1.493 +- { "omega", 0, "o" }, 1.494 +- { "omicron", 0, "o" }, 1.495 ++ { "omega", 0, "o", 0x03c9}, 1.496 ++ { "omicron", 0, "o", 0x03bf}, 1.497 + { "oplus", 0, "(+)" }, 1.498 + { "or", 0, "OR" }, 1.499 +- { "ordf", LATIN1_ordf, "-a" }, 1.500 +- { "ordm", LATIN1_ordm, "-o" }, 1.501 +- { "oslash", LATIN1_oslash, "o/" }, 1.502 +- { "otilde", LATIN1_otilde, "o~" }, 1.503 ++ { "ordf", LATIN1_ordf, "-a", 0x00aa}, 1.504 ++ { "ordm", LATIN1_ordm, "-o", 0x00ba}, 1.505 ++ { "oslash", LATIN1_oslash, "o/", 0x00f8}, 1.506 ++ { "otilde", LATIN1_otilde, "o~", 0x00f5}, 1.507 + { "otimes", 0, "(x)" }, 1.508 +- { "ouml", LATIN1_ouml, "o\"" }, 1.509 +- { "para", LATIN1_para, "P:" }, 1.510 +- { "part", 0, "PART" }, 1.511 +- { "permil", 0, " 0/00" }, 1.512 ++ { "ouml", LATIN1_ouml, "o\"", 0x00f6}, 1.513 ++ { "para", LATIN1_para, "P:", 0x00b6}, 1.514 ++ { "part", 0, "PART",0x2202}, 1.515 ++ { "permil", 0, " 0/00",0x2030}, 1.516 + { 
"perp", 0, "-T" }, 1.517 +- { "phi", 0, "f" }, 1.518 +- { "pi", 0, "p" }, 1.519 ++ { "phi", 0, "f", 0x03c6}, 1.520 ++ { "pi", 0, "p", 0x03c0}, 1.521 + { "piv", 0, "Pi" }, 1.522 +- { "plusmn", LATIN1_plusmn, "+/-" }, 1.523 +- { "pound", LATIN1_pound, "-L-" }, 1.524 ++ { "plusmn", LATIN1_plusmn, "+/-", 0x00b1}, 1.525 ++ { "pound", LATIN1_pound, "-L-", 0x00a3}, 1.526 + { "prime", 0, "'" }, 1.527 +- { "prod", 0, "PROD" }, 1.528 ++ { "prod", 0, "PROD",0x220f}, 1.529 + { "prop", 0, "0(" }, 1.530 +- { "psi", 0, "ps" }, 1.531 ++ { "psi", 0, "ps", 0x03c8}, 1.532 + { "quot", 0, "\"" }, 1.533 + { "rArr", 0, "=>" }, 1.534 +- { "radic", 0, "SQRT" }, 1.535 ++ { "radic", 0, "SQRT",0x221a}, 1.536 + { "rang", 0, "/>" }, 1.537 + { "raquo", LATIN1_raquo, ">>" }, 1.538 +- { "rarr", 0, "->" }, 1.539 ++ { "rarr", 0, "->", 0x2192}, 1.540 + // { "rceil", 0, ">|" }, 1.541 + { "rdquo", 0, "\"" }, 1.542 + { "real", 0, "Re" }, 1.543 +- { "reg", LATIN1_reg, "(R)" }, 1.544 ++ { "reg", LATIN1_reg, "(R)", 0x00ae}, 1.545 + // { "rfloor", 0, "|>" }, 1.546 +- { "rho", 0, "r" }, 1.547 ++ { "rho", 0, "r", 0x03c1}, 1.548 + { "rsaquo", 0, ">" }, 1.549 + { "rsquo", 0, "'" }, 1.550 + { "sbquo", 0, "'" }, 1.551 +- { "scaron", 0, "s" }, 1.552 ++ { "scaron", 0, "s", 0x0161}, 1.553 + { "sdot", 0, "DOT" }, 1.554 +- { "sect", LATIN1_sect, "S:" }, 1.555 ++ { "sect", LATIN1_sect, "S:", 0x00a7}, 1.556 + { "shy", LATIN1_shy, "" }, 1.557 +- { "sigma", 0, "s" }, 1.558 +- { "sigmaf", 0, "s" }, 1.559 ++ { "sigma", 0, "s", 0x03c3}, 1.560 ++ { "sigmaf", 0, "s", 0x03c2}, 1.561 + { "sim", 0, "~" }, 1.562 + // { "spades", 0, "[spades]" }, 1.563 + { "sub", 0, "(C" }, 1.564 + { "sube", 0, "(_" }, 1.565 +- { "sum", 0, "SUM" }, 1.566 ++ { "sum", 0, "SUM", 0x2211}, 1.567 + { "sup", 0, ")C" }, 1.568 +- { "sup1", LATIN1_sup1, "^1" }, 1.569 +- { "sup2", LATIN1_sup2, "^2" }, 1.570 +- { "sup3", LATIN1_sup3, "^3" }, 1.571 ++ { "sup1", LATIN1_sup1, "^1", 0x00b9}, 1.572 ++ { "sup2", LATIN1_sup2, "^2", 0x00b2}, 1.573 ++ { "sup3", 
LATIN1_sup3, "^3", 0x00b3}, 1.574 + { "supe", 0, ")_" }, 1.575 +- { "szlig", LATIN1_szlig, "ss" }, 1.576 +- { "tau", 0, "t" }, 1.577 ++ { "szlig", LATIN1_szlig, "ss", 0x00df}, 1.578 ++ { "tau", 0, "t", 0x03c4}, 1.579 + { "there4", 0, ".:" }, 1.580 +- { "theta", 0, "th" }, 1.581 +- { "thorn", LATIN1_thorn, "th" }, 1.582 +- { "tilde", 0, "~" }, 1.583 +- { "times", LATIN1_times, "x" }, 1.584 +- { "trade", 0, "[TM]" }, 1.585 ++ { "theta", 0, "th", 0x03b8}, 1.586 ++ { "thorn", LATIN1_thorn, "th", 0x00fe}, 1.587 ++ { "tilde", 0, "~", 0x02dc}, 1.588 ++ { "times", LATIN1_times, "x", 0x00d7}, 1.589 ++ { "trade", 0, "[TM]",0x2122}, 1.590 + { "uArr", 0, "^^" }, 1.591 +- { "uacute", LATIN1_uacute, "u'" }, 1.592 ++ { "uacute", LATIN1_uacute, "u'", 0x00fa}, 1.593 + { "uarr", 0, "^" }, 1.594 +- { "ucirc", LATIN1_ucirc, "u^" }, 1.595 +- { "ugrave", LATIN1_ugrave, "u`" }, 1.596 +- { "uml", LATIN1_uml, "\"" }, 1.597 +- { "upsilon", 0, "u" }, 1.598 +- { "uuml", LATIN1_uuml, "u\"" }, 1.599 ++ { "ucirc", LATIN1_ucirc, "u^", 0x00fb}, 1.600 ++ { "ugrave", LATIN1_ugrave, "u`", 0x00f9}, 1.601 ++ { "uml", LATIN1_uml, "\"", 0x00a8}, 1.602 ++ { "upsilon", 0, "u", 0x03c5}, 1.603 ++ { "uuml", LATIN1_uuml, "u\"", 0x00fc}, 1.604 + { "weierp", 0, "P" }, 1.605 +- { "xi", 0, "x" }, 1.606 +- { "yacute", LATIN1_yacute, "y'" }, 1.607 +- { "yen", LATIN1_yen, "YEN" }, 1.608 +- { "yuml", LATIN1_yuml, "y\"" }, 1.609 +- { "zeta", 0, "z" }, 1.610 ++ { "xi", 0, "x", 0x03be}, 1.611 ++ { "yacute", LATIN1_yacute, "y'", 0x00fd}, 1.612 ++ { "yen", LATIN1_yen, "YEN", 0x00a5}, 1.613 ++ { "yuml", LATIN1_yuml, "y\"", 0x00ff}, 1.614 ++ { "zeta", 0, "z", 0x03b6}, 1.615 + }; 1.616 + 1.617 +-extern int use_iso8859; 1.618 ++extern int use_encoding; 1.619 + 1.620 + /* ------------------------------------------------------------------------- */ 1.621 + 1.622 ++char ubuf[4]; 1.623 ++ 1.624 ++char *mkutf(unsigned long x) 1.625 ++{ 1.626 ++ memset(ubuf, 0, 4); 1.627 ++ if(x < 128) ubuf[0] = x; 1.628 ++ else if(x < 0x800) { 
1.629 ++ ubuf[0] = (0xc0 | ((x >> 6) & 0x1f)); 1.630 ++ ubuf[1] = (0x80 | (x & 0x3f)); 1.631 ++ } 1.632 ++ else { 1.633 ++ ubuf[0] = (0xe0 | ((x >> 12) & 0x0f)); 1.634 ++ ubuf[1] = (0x80 | ((x >> 6) & 0x3f)); 1.635 ++ ubuf[2] = (0x80 | (x & 0x3f)); 1.636 ++ } 1.637 ++ return ubuf; 1.638 ++} 1.639 ++ 1.640 + void 1.641 + replace_sgml_entities(string *s) 1.642 + { 1.643 +@@ -330,9 +349,9 @@ 1.644 + */ 1.645 + while (j < l && s->at(j) != '&') ++j; 1.646 + /* 1.647 +- * We could convert high-bit chars to "&eacute;" here if use_iso8859 1.648 +- * is off, then let them be translated or not. Is the purpose of 1.649 +- * !use_iso8859 to allow SGML entities to be seen, or to strongly 1.650 ++ * We could convert high-bit chars to "&eacute;" here if USE_ASCII 1.651 ++ * is on, then let them be translated or not. Is the purpose of 1.652 ++ * USE_ASCII to allow SGML entities to be seen, or to strongly 1.653 + * filter against high-ASCII chars that might blow up a terminal 1.654 + * that doesn't speak ISO8859? For the moment, "allow SGML entities 1.655 + * to be seen" -- no filtering here.
1.656 +@@ -370,7 +389,11 @@ 1.657 + if (!isdigit(c)) break; 1.658 + x = 10 * x + c - '0'; 1.659 + } 1.660 +- if (use_iso8859 || (x < 128)) { 1.661 ++ if (USE_UTF8) { 1.662 ++ s->replace(beg, j - beg, mkutf(x)); 1.663 ++ j = beg + 1; 1.664 ++ } 1.665 ++ else if (USE_ISO8859 && (x < 256) || USE_ASCII && (x < 128)) { 1.666 + s->replace(beg, j - beg, 1, (char) x); 1.667 + j = beg + 1; 1.668 + } else { 1.669 +@@ -408,13 +431,17 @@ 1.670 + (int (*)(const void *, const void *)) strcmp 1.671 + ); 1.672 + if (entity != NULL) { 1.673 +- if (use_iso8859 && entity->iso8859code) { 1.674 ++ if (USE_ISO8859 && entity->iso8859code) { 1.675 + s->replace(beg, j - beg, 1, (char) entity->iso8859code); 1.676 + j = beg + 1; 1.677 +- } else if (entity->asciistr) { 1.678 ++ } else if (!(USE_UTF8 && entity->unicode) && entity->asciistr) { 1.679 + s->replace(beg, j - beg, entity->asciistr); 1.680 + j = beg + 1; 1.681 + } /* else don't replace it at all, we don't have a translation */ 1.682 ++ else if(USE_UTF8 && entity->unicode) { 1.683 ++ s->replace(beg, j - beg, mkutf(entity->unicode)); 1.684 ++ j = beg + 1; 1.685 ++ } 1.686 + } 1.687 + } else { 1.688 + ; /* EXTENSION: Allow literal '&' sometimes. */ 1.689 +diff -r -u -bB html2text-1.3.2a/table.C html2text-1.3.2a-patched/table.C 1.690 +--- html2text-1.3.2a/table.C 2002-07-22 13:32:50.000000000 +0200 1.691 ++++ html2text-1.3.2a-patched/table.C 2005-05-13 22:19:59.871136320 +0200 1.692 +@@ -175,7 +175,7 @@ 1.693 + - (*number_of_columns_return - 1) * (column_spacing + 0), 1.694 + Area::LEFT // Yields better results than "p->halign"! 1.695 + )); 1.696 +- p->width = tmp.get() ? tmp->width() : 0; 1.697 ++ p->width = tmp.get() ? tmp->utf_width() : 0; 1.698 + } 1.699 + p->minimized = false; 1.700 + 1.701 +@@ -308,7 +308,7 @@ 1.702 + left_of_column + old_column_width - 1, 1.703 + Area::LEFT // Yields better results than "lc.halign"!
1.704 + )); 1.705 +- w = tmp->width(); 1.706 ++ w = tmp->utf_width(); 1.707 + if (w >= left_of_column + old_column_width) lc.minimized = true; 1.708 + } 1.709 + if (w > left_of_column + new_column_width) {