wok-current: 36ec463f0113 html2text/stuff/patch-utf8-html2text-1.3.2a.diff

wok-current view html2text/stuff/patch-utf8-html2text-1.3.2a.diff @ rev 23250

updated nsd (4.1.27 -> 4.3.0)

author	Hans-Günter Theisgen
date	Wed Mar 25 14:08:08 2020 +0100 (2020-03-25)
parents
children

line source

1 diff -r -u -bB html2text-1.3.2a/Area.C html2text-1.3.2a-patched/Area.C

2 --- html2text-1.3.2a/Area.C 2003-11-23 12:05:29.000000000 +0100

3 +++ html2text-1.3.2a-patched/Area.C 2005-05-13 22:19:59.862137688 +0200

4 @@ -36,10 +36,13 @@

5 #include <iostream>

7 #include "Area.h"

8 +#include "html.h"

9 #include "string.h"

11 #define LATIN1_nbsp 160

13 +extern int use_encoding;

14 +

15 /* ------------------------------------------------------------------------- */

17 #define malloc_array(type, size)\

18 @@ -81,6 +84,27 @@

20 /* ------------------------------------------------------------------------- */

22 +/* utf_length() and utf_width()

23 + *

24 + * Very simplified algorithm of calculating length of UTF-8

25 + * string. No check for errors. Counting only ASCII bytes and

26 + * leading bytes of UTF-8 multibyte sequences. All bytes like

27 + * 10xxxxxx are dropped. If USE_UTF8 is false then returns

28 + * usual length. --YS

29 + */

30 +

31 +unsigned int

32 +Line::utf_length(size_type f, size_type t) const /* characters (not bytes) in cells [f, t) */

33 +{

34 +  const size_type end = (length_ > t ? t : length_); /* clamp t to the line length */

35 +  size_type count = end - f;

36 +  if(!USE_UTF8) return count; /* one byte per character outside UTF-8 mode */

37 +  for (int pos = f; pos < end; ++pos) {

38 +    if((cells_[pos].character & 0xc0) == 0x80) --count; /* 10xxxxxx: continuation byte */

39 +  }

40 +  return count;

41 +}

42 +

43 void

44 Line::resize(size_type l)

45 {

46 @@ -236,6 +260,23 @@

47 return *this;

48 }

50 +unsigned int

51 +Area::utf_width() /* widest row in characters; plain width_ unless USE_UTF8 */

52 +{

53 +  size_type r = width_;

54 +  if(USE_UTF8) { r = 0;

55 +    for (size_type yy = 0; yy < height_; yy++) {

56 +      size_type r1 = 0; /* characters counted so far in row yy */

57 +      for (int i = width_ - 1; i >= 0; i--) { /* right to left, so trailing blanks come first */

58 +        if(!r1 && isspace((unsigned char) cells_[yy][i].character)) continue; /* cast: isspace() on a negative char (any byte >= 0x80) is undefined */

59 +        if((cells_[yy][i].character & 0xc0) != 0x80) r1++; /* count everything except 10xxxxxx continuation bytes */

60 +      }

61 +      if(r < r1) r = r1; /* keep the maximum over all rows */

62 +    }

63 +  }

64 +  return r;

65 +}

66 +

67 void

68 Area::resize(size_type w, size_type h)

69 {

70 @@ -439,7 +480,7 @@

71 char c = p->character;

72 char a = p->attribute;

74 - if (c == (char) LATIN1_nbsp) c = ' ';

75 + if (c == (char) LATIN1_nbsp && !USE_UTF8) c = ' ';

77 if (a == Cell::NONE) {

78 os << c;

79 Nur in html2text-1.3.2a-patched/: Area.C.orig.

80 diff -r -u -bB html2text-1.3.2a/Area.h html2text-1.3.2a-patched/Area.h

81 --- html2text-1.3.2a/Area.h 2003-11-23 12:05:29.000000000 +0100

82 +++ html2text-1.3.2a-patched/Area.h 2005-05-13 22:19:59.863137536 +0200

83 @@ -81,6 +81,8 @@

84 Cell &operator[](size_type x) { return cells_[x]; }

85 const Cell *cells() const { return cells_; }

87 + unsigned int utf_length(size_type f, size_type t) const;

88 +

89 void resize(size_type l);

90 void enlarge(size_type l) { if (l > length_) resize(l); }

92 @@ -134,6 +136,8 @@

93 Cell *operator[](size_type y) { return cells_[y]; }

94 const Area &operator>>=(size_type rs);

96 + unsigned int utf_width();

97 +

98 void resize(size_type w, size_type h);

99 void enlarge(size_type w, size_type h);

100

101 Nur in html2text-1.3.2a-patched/: Area.h.orig.

102 diff -r -u -bB html2text-1.3.2a/format.C html2text-1.3.2a-patched/format.C

103 --- html2text-1.3.2a/format.C 2003-11-23 12:05:29.000000000 +0100

104 +++ html2text-1.3.2a-patched/format.C 2005-05-13 22:19:59.865137232 +0200

105 @@ -1210,6 +1210,7 @@

106 }

107

108 Line::size_type to = from + 1;

109 + int to_from;

110

111 Line::size_type lbp = (Line::size_type) -1; // "Last break position".

112

113 @@ -1238,18 +1239,20 @@

114 to++;

115 }

116

117 - if (to - from > w && lbp != (Area::size_type) -1) { to = lbp; break; }

118 + if (line.utf_length(from,to) > w && lbp != (Area::size_type) -1)

119 + { to = lbp; break; }

120 }

121

122 + to_from = line.utf_length(from,to);

123 /*

124 * Copy the "from...to" range from the "line" to the bottom of the "res"

125 * Area.

126 */

127 Area::size_type x = 0;

128 Area::size_type len = to - from;

129 - if (halign == Area::LEFT || len >= w) { ; } else

130 - if (halign == Area::CENTER) { x += (w - len) / 2; } else

131 - if (halign == Area::RIGHT) { x += w - len; }

132 + if (halign == Area::LEFT || to_from >= w) { ; } else

133 + if (halign == Area::CENTER) { x += (w - to_from) / 2; } else

134 + if (halign == Area::RIGHT) { x += w - to_from; }

135 res->insert(line.cells() + from, len, x, res->height());

136

137 /*

138 Nur in html2text-1.3.2a-patched/: format.C.orig.

139 diff -r -u -bB html2text-1.3.2a/html2text.C html2text-1.3.2a-patched/html2text.C

140 --- html2text-1.3.2a/html2text.C 2003-11-23 12:05:29.000000000 +0100

141 +++ html2text-1.3.2a-patched/html2text.C 2005-05-13 22:19:59.868136776 +0200

142 @@ -148,9 +148,10 @@

143 -o <file> Redirect output into <file>\n\

144 -nobs Do not use backspaces for boldface and underlining\n\

145 -ascii Use plain ASCII for output instead of ISO-8859-1\n\

146 + -utf8 Assume both terminal and input stream are in UTF-8 mode\n\

147 ";

148

149 -int use_iso8859 = 1;

150 +int use_encoding = ISO8859;

151

152 int

153 main(int argc, char **argv)

154 @@ -199,7 +200,8 @@

155 if (!strcmp(arg, "-width" )) { width = atoi(argv[++i]); } else

156 if (!strcmp(arg, "-o" )) { output_file_name = argv[++i]; } else

157 if (!strcmp(arg, "-nobs" )) { use_backspaces = false; } else

158 - if (!strcmp(arg, "-ascii" )) { use_iso8859 = false; } else

159 + if (!strcmp(arg, "-ascii" )) { use_encoding = ASCII; } else

160 + if (!strcmp(arg, "-utf8" )) { use_encoding = UTF8; } else

161 {

162 std::cerr

163 << "Unrecognized command line option \""

164 Nur in html2text-1.3.2a-patched/: html2text.C.orig.

165 diff -r -u -bB html2text-1.3.2a/html.h html2text-1.3.2a-patched/html.h

166 --- html2text-1.3.2a/html.h 2001-10-04 22:03:54.000000000 +0200

167 +++ html2text-1.3.2a-patched/html.h 2005-05-13 22:19:59.866137080 +0200

168 @@ -61,6 +61,11 @@

169

170 /* ------------------------------------------------------------------------- */

171

172 +enum {ASCII, ISO8859, UTF8}; /* values stored in the global use_encoding */

173 +#define USE_ISO8859 (use_encoding == ISO8859) /* these macros expect use_encoding to be declared by the including file */

174 +#define USE_ASCII (use_encoding == ASCII)

175 +#define USE_UTF8 (use_encoding == UTF8)

176 +

177 #define LATIN1_nbsp 160

178 #define LATIN1_iexcl 161

179 #define LATIN1_cent 162

180 diff -r -u -bB html2text-1.3.2a/sgml.C html2text-1.3.2a-patched/sgml.C

181 --- html2text-1.3.2a/sgml.C 2003-11-23 12:09:11.000000000 +0100

182 +++ html2text-1.3.2a-patched/sgml.C 2005-05-13 22:19:59.870136472 +0200

183 @@ -62,261 +62,280 @@

184 char name[8];

185 int iso8859code;

186 char *asciistr;

187 + unsigned long unicode;

188 } entities[] = {

189 - { "AElig", LATIN1_AElig, "AE" },

190 - { "AMP", 0, "&" },

191 - { "Aacute", LATIN1_Aacute, "A'" },

192 - { "Acirc", LATIN1_Acirc, "A^" },

193 - { "Agrave", LATIN1_Agrave, "A`" },

194 - { "Alpha", 0, "A" },

195 - { "Aring", LATIN1_Aring, "AA" },

196 - { "Atilde", LATIN1_Atilde, "A~" },

197 - { "Auml", LATIN1_Auml, "A\"" },

198 - { "Beta", 0, "B" },

199 - { "Ccedil", LATIN1_Ccedil, "C," },

200 - { "Chi", 0, "H" },

201 - { "Dagger", 0, "++" },

202 - { "Delta", 0, "D" },

203 - { "ETH", LATIN1_ETH, "D-" },

204 - { "Eacute", LATIN1_Eacute, "E'" },

205 - { "Ecirc", LATIN1_Ecirc, "E^" },

206 - { "Egrave", LATIN1_Egrave, "E`" },

207 - { "Epsilon", 0, "E" },

208 - { "Eta", 0, "E" },

209 - { "Euml", LATIN1_Euml, "E\"" },

210 - { "GT", 0, ">" },

211 - { "Gamma", 0, "G" },

212 - { "Iacute", LATIN1_Iacute, "I'" },

213 - { "Icirc", LATIN1_Icirc, "I^" },

214 - { "Igrave", LATIN1_Igrave, "I`" },

215 - { "Iota", 0, "I" },

216 - { "Iuml", LATIN1_Iuml, "I\"" },

217 - { "Kappa", 0, "K" },

218 - { "LT", 0, "<" },

219 - { "Lambda", 0, "L" },

220 - { "Mu", 0, "M" },

221 - { "Ntilde", LATIN1_Ntilde, "N~" },

222 - { "Nu", 0, "N" },

223 - { "OElig", 0, "OE" },

224 - { "Oacute", LATIN1_Oacute, "O'" },

225 - { "Ocirc", LATIN1_Ocirc, "O^" },

226 - { "Ograve", LATIN1_Ograve, "O`" },

227 - { "Omega", 0, "O" },

228 - { "Omicron", 0, "O" },

229 - { "Oslash", LATIN1_Oslash, "O/" },

230 - { "Otilde", LATIN1_Otilde, "O~" },

231 - { "Ouml", LATIN1_Ouml, "O\"" },

232 - { "Phi", 0, "F" },

233 - { "Pi", 0, "P" },

234 - { "Prime", 0, "''" },

235 - { "Psi", 0, "PS" },

236 - { "QUOT", 0, "\"" },

237 - { "Rho", 0, "R" },

238 - { "Scaron", 0, "S" },

239 - { "Sigma", 0, "S" },

240 - { "THORN", LATIN1_THORN, "TH" },

241 - { "Tau", 0, "T" },

242 - { "Theta", 0, "TH" },

243 - { "Uacute", LATIN1_Uacute, "U'" },

244 - { "Ucirc", LATIN1_Ucirc, "U^" },

245 - { "Ugrave", LATIN1_Ugrave, "U`" },

246 - { "Upsilon", 0, "U" },

247 - { "Uuml", LATIN1_Uuml, "U\"" },

248 - { "Xi", 0, "X" },

249 - { "Yacute", LATIN1_Yacute, "Y'" },

250 - { "Yuml", 0, "Y\"" },

251 - { "Zeta", 0, "Z" },

252 - { "aacute", LATIN1_aacute, "a'" },

253 - { "acirc", LATIN1_acirc, "a^" },

254 - { "acute", LATIN1_acute, "'" },

255 - { "aelig", LATIN1_aelig, "ae" },

256 - { "agrave", LATIN1_agrave, "a`" },

257 + { "AElig", LATIN1_AElig, "AE", 0x00c6},

258 + { "AMP", 0, "&", 0x0026},

259 + { "Aacute", LATIN1_Aacute, "A'", 0x00c1},

260 + { "Acirc", LATIN1_Acirc, "A^", 0x00c2},

261 + { "Agrave", LATIN1_Agrave, "A`", 0x00c0},

262 + { "Alpha", 0, "A", 0x0391},

263 + { "Aring", LATIN1_Aring, "AA", 0x00c5},

264 + { "Atilde", LATIN1_Atilde, "A~", 0x00c3},

265 + { "Auml", LATIN1_Auml, "A\"", 0x00c4},

266 + { "Beta", 0, "B", 0x0392},

267 + { "Ccedil", LATIN1_Ccedil, "C,", 0x00c7},

268 + { "Chi", 0, "H", 0x03a7},

269 + { "Dagger", 0, "++", 0x2021}, /* U+2021 DOUBLE DAGGER; U+2020 is the single dagger */

270 + { "Delta", 0, "D", 0x0394},

271 + { "ETH", LATIN1_ETH, "D-", 0x00d0},

272 + { "Eacute", LATIN1_Eacute, "E'", 0x00c9},

273 + { "Ecirc", LATIN1_Ecirc, "E^", 0x00ca},

274 + { "Egrave", LATIN1_Egrave, "E`", 0x00c8},

275 + { "Epsilon", 0, "E", 0x0395},

276 + { "Eta", 0, "E", 0x0397},

277 + { "Euml", LATIN1_Euml, "E\"", 0x00cb},

278 + { "GT", 0, ">", 0x003e},

279 + { "Gamma", 0, "G", 0x0393},

280 + { "Iacute", LATIN1_Iacute, "I'", 0x00cd},

281 + { "Icirc", LATIN1_Icirc, "I^", 0x00ce},

282 + { "Igrave", LATIN1_Igrave, "I`", 0x00cc},

283 + { "Iota", 0, "I", 0x0399},

284 + { "Iuml", LATIN1_Iuml, "I\"", 0x00cf},

285 + { "Kappa", 0, "K", 0x039a},

286 + { "LT", 0, "<", 0x003c},

287 + { "Lambda", 0, "L", 0x039b},

288 + { "Mu", 0, "M", 0x039c},

289 + { "Ntilde", LATIN1_Ntilde, "N~", 0x00d1},

290 + { "Nu", 0, "N", 0x039d},

291 + { "OElig", 0, "OE", 0x0152},

292 + { "Oacute", LATIN1_Oacute, "O'", 0x00d3},

293 + { "Ocirc", LATIN1_Ocirc, "O^", 0x00d4},

294 + { "Ograve", LATIN1_Ograve, "O`", 0x00d2},

295 + { "Omega", 0, "O", 0x03a9},

296 + { "Omicron", 0, "O", 0x039f},

297 + { "Oslash", LATIN1_Oslash, "O/", 0x00d8},

298 + { "Otilde", LATIN1_Otilde, "O~", 0x00d5},

299 + { "Ouml", LATIN1_Ouml, "O\"", 0x00d6},

300 + { "Phi", 0, "F", 0x03a6},

301 + { "Pi", 0, "P", 0x03a0},

302 + { "Prime", 0, "''", 0x2033}, /* U+2033 DOUBLE PRIME; was left 0, i.e. never translated in UTF-8 mode */

303 + { "Psi", 0, "PS", 0x03a8},

304 + { "QUOT", 0, "\"", 0x0022}, /* U+0022 QUOTATION MARK; was left 0, i.e. never translated in UTF-8 mode */

305 + { "Rho", 0, "R", 0x03a1},

306 + { "Scaron", 0, "S", 0x0160}, /* U+0160 capital S with caron; U+0161 is the lowercase form */

307 + { "Sigma", 0, "S", 0x03a3},

308 + { "THORN", LATIN1_THORN, "TH", 0x00de},

309 + { "Tau", 0, "T", 0x03a4},

310 + { "Theta", 0, "TH", 0x0398},

311 + { "Uacute", LATIN1_Uacute, "U'", 0x00da},

312 + { "Ucirc", LATIN1_Ucirc, "U^", 0x00db},

313 + { "Ugrave", LATIN1_Ugrave, "U`", 0x00d9},

314 + { "Upsilon", 0, "U", 0x03a5},

315 + { "Uuml", LATIN1_Uuml, "U\"", 0x00dc},

316 + { "Xi", 0, "X", 0x039e},

317 + { "Yacute", LATIN1_Yacute, "Y'", 0x00dd},

318 + { "Yuml", 0, "Y\"", 0x0178},

319 + { "Zeta", 0, "Z", 0x0396},

320 + { "aacute", LATIN1_aacute, "a'", 0x00e1},

321 + { "acirc", LATIN1_acirc, "a^", 0x00e2},

322 + { "acute", LATIN1_acute, "'", 0x00b4},

323 + { "aelig", LATIN1_aelig, "ae", 0x00e6},

324 + { "agrave", LATIN1_agrave, "a`", 0x00e0},

325 { "alefsym", 0, "Aleph" },

326 - { "alpha", 0, "a" },

327 + { "alpha", 0, "a", 0x03b1},

328 { "amp", 0, "&" },

329 { "and", 0, "AND" },

330 { "ang", 0, "-V" },

331 { "apos", 0, "'" },

332 - { "aring", LATIN1_aring, "aa" },

333 - { "asymp", 0, "~=" },

334 - { "atilde", LATIN1_atilde, "a~" },

335 - { "auml", LATIN1_auml, "a\"" },

336 + { "aring", LATIN1_aring, "aa", 0x00e5},

337 + { "asymp", 0, "~=", 0x2248},

338 + { "atilde", LATIN1_atilde, "a~", 0x00e3},

339 + { "auml", LATIN1_auml, "a\"", 0x00e4}, /* U+00E4 a with diaeresis; U+00E5 is aring */

340 { "bdquo", 0, "\"" },

341 - { "beta", 0, "b" },

342 - { "brvbar", LATIN1_brvbar, "|" },

343 - { "bull", 0, " o " },

344 + { "beta", 0, "b", 0x03b2},

345 + { "brvbar", LATIN1_brvbar, "|", 0x00a6},

346 + { "bull", 0, " o ", 0x2022},

347 { "cap", 0, "(U" },

348 - { "ccedil", LATIN1_ccedil, "c," },

349 - { "cedil", LATIN1_cedil, "," },

350 - { "cent", LATIN1_cent, "-c-" },

351 - { "chi", 0, "h" },

352 - { "circ", 0, "^" },

353 + { "ccedil", LATIN1_ccedil, "c,", 0x00e7},

354 + { "cedil", LATIN1_cedil, ",", 0x00b8},

355 + { "cent", LATIN1_cent, "-c-", 0x00a2},

356 + { "chi", 0, "h", 0x03c7},

357 + { "circ", 0, "^", 0x005e},

358 // { "clubs", 0, "[clubs]" },

359 { "cong", 0, "?=" },

360 - { "copy", LATIN1_copy, "(c)" },

361 + { "copy", LATIN1_copy, "(c)", 0x00a9},

362 { "crarr", 0, "<-'" },

363 { "cup", 0, ")U" },

364 - { "curren", LATIN1_curren, "CUR" },

365 + { "curren", LATIN1_curren, "CUR", 0x00a4},

366 { "dArr", 0, "vv" },

367 - { "dagger", 0, "+" },

368 + { "dagger", 0, "+", 0x2020},

369 { "darr", 0, "v" },

370 - { "deg", LATIN1_deg, "DEG" },

371 - { "delta", 0, "d" },

372 + { "deg", LATIN1_deg, "DEG", 0x00b0},

373 + { "delta", 0, "d", 0x03b4},

374 // { "diams", 0, "[diamonds]" },

375 - { "divide", LATIN1_divide, "/" },

376 - { "eacute", LATIN1_eacute, "e'" },

377 - { "ecirc", LATIN1_ecirc, "e^" },

378 - { "egrave", LATIN1_egrave, "e`" },

379 + { "divide", LATIN1_divide, "/", 0x00f7},

380 + { "eacute", LATIN1_eacute, "e'", 0x00e9},

381 + { "ecirc", LATIN1_ecirc, "e^", 0x00ea},

382 + { "egrave", LATIN1_egrave, "e`", 0x00e8},

383 { "empty", 0, "{}" },

384 - { "epsilon", 0, "e" },

385 - { "equiv", 0, "==" },

386 - { "eta", 0, "e" },

387 - { "eth", LATIN1_eth, "d-" },

388 - { "euml", LATIN1_euml, "e\"" },

389 - { "euro", 0, "EUR" },

390 + { "epsilon", 0, "e", 0x03b5},

391 + { "equiv", 0, "==", 0x2261},

392 + { "eta", 0, "e", 0x03b7},

393 + { "eth", LATIN1_eth, "d-", 0x00f0},

394 + { "euml", LATIN1_euml, "e\"", 0x00eb},

395 + { "euro", 0, "EUR", 0x20ac},

396 { "exist", 0, "TE" },

397 { "fnof", 0, "f" },

398 { "forall", 0, "FA" },

399 - { "frac12", LATIN1_frac12, " 1/2" },

400 - { "frac14", LATIN1_frac14, " 1/4" },

401 - { "frac34", LATIN1_frac34, " 3/4" },

402 + { "frac12", LATIN1_frac12, " 1/2",0x00bd},

403 + { "frac14", LATIN1_frac14, " 1/4",0x00bc},

404 + { "frac34", LATIN1_frac34, " 3/4",0x00be},

405 { "frasl", 0, "/" },

406 - { "gamma", 0, "g" },

407 - { "ge", 0, ">=" },

408 - { "gt", 0, ">" },

409 + { "gamma", 0, "g", 0x03b3},

410 + { "ge", 0, ">=", 0x2265},

411 + { "gt", 0, ">", 0x003e},

412 { "hArr", 0, "<=>" },

413 { "harr", 0, "<->" },

414 // { "hearts", 0, "[hearts]" },

415 - { "hellip", 0, "..." },

416 - { "iacute", LATIN1_iacute, "i'" },

417 - { "icirc", LATIN1_icirc, "i^" },

418 - { "iexcl", LATIN1_iexcl, "!" },

419 - { "igrave", LATIN1_igrave, "i`" },

420 + { "hellip", 0, "...", 0x2026},

421 + { "iacute", LATIN1_iacute, "i'", 0x00ed},

422 + { "icirc", LATIN1_icirc, "i^", 0x00ee},

423 + { "iexcl", LATIN1_iexcl, "!", 0x00a1},

424 + { "igrave", LATIN1_igrave, "i`", 0x00ec},

425 { "image", 0, "Im" },

426 - { "infin", 0, "oo" },

427 - { "int", 0, "INT" },

428 - { "iota", 0, "i" },

429 - { "iquest", LATIN1_iquest, "?" },

430 + { "infin", 0, "oo", 0x221e},

431 + { "int", 0, "INT", 0x222b},

432 + { "iota", 0, "i", 0x03b9},

433 + { "iquest", LATIN1_iquest, "?", 0x00bf},

434 { "isin", 0, "(-" },

435 - { "iuml", LATIN1_iuml, "i\"" },

436 - { "kappa", 0, "k" },

437 + { "iuml", LATIN1_iuml, "i\"", 0x00ef},

438 + { "kappa", 0, "k", 0x03ba},

439 { "lArr", 0, "<=" },

440 - { "lambda", 0, "l" },

441 + { "lambda", 0, "l", 0x03bb},

442 { "lang", 0, "</" },

443 { "laquo", LATIN1_laquo, "<<" },

444 - { "larr", 0, "<-" },

445 + { "larr", 0, "<-", 0x2190},

446 // { "lceil", 0, "<|" },

447 { "ldquo", 0, "\"" },

448 - { "le", 0, "<=" },

449 + { "le", 0, "<=", 0x2264},

450 // { "lfloor", 0, "|<" },

451 { "lowast", 0, "*" },

452 { "loz", 0, "<>" },

453 { "lsaquo", 0, "<" },

454 { "lsquo", 0, "`" },

455 - { "lt", 0, "<" },

456 - { "macr", LATIN1_macr, "-" },

457 + { "lt", 0, "<", 0x003c},

458 + { "macr", LATIN1_macr, "-", 0x00af},

459 { "mdash", 0, "--" },

460 - { "micro", LATIN1_micro, "my" },

461 - { "middot", LATIN1_middot, "." },

462 - { "minus", 0, "-" },

463 - { "mu", 0, "m" },

464 + { "micro", LATIN1_micro, "my", 0x00b5},

465 + { "middot", LATIN1_middot, ".", 0x00b7},

466 + { "minus", 0, "-", 0x2212},

467 + { "mu", 0, "m", 0x03bc},

468 { "nabla", 0, "Nabla" },

469 - { "nbsp", LATIN1_nbsp, " " },

470 + { "nbsp", LATIN1_nbsp, " ", 0x00a0},

471 { "ndash", 0, "-" },

472 - { "ne", 0, "!=" },

473 + { "ne", 0, "!=", 0x2260},

474 { "ni", 0, "-)" },

475 { "not", LATIN1_not, "NOT" },

476 { "notin", 0, "!(-" },

477 { "nsub", 0, "!(C" },

478 - { "ntilde", LATIN1_ntilde, "n~" },

479 - { "nu", 0, "n" },

480 - { "oacute", LATIN1_oacute, "o'" },

481 - { "ocirc", LATIN1_ocirc, "o^" },

482 + { "ntilde", LATIN1_ntilde, "n~", 0x00f1},

483 + { "nu", 0, "n", 0x03bd},

484 + { "oacute", LATIN1_oacute, "o'", 0x00f3},

485 + { "ocirc", LATIN1_ocirc, "o^", 0x00f4},

486 { "oelig", 0, "oe" },

487 - { "ograve", LATIN1_ograve, "o`" },

488 + { "ograve", LATIN1_ograve, "o`", 0x00f2},

489 { "oline", LATIN1_macr, "-" },

490 - { "omega", 0, "o" },

491 - { "omicron", 0, "o" },

492 + { "omega", 0, "o", 0x03c9},

493 + { "omicron", 0, "o", 0x03bf},

494 { "oplus", 0, "(+)" },

495 { "or", 0, "OR" },

496 - { "ordf", LATIN1_ordf, "-a" },

497 - { "ordm", LATIN1_ordm, "-o" },

498 - { "oslash", LATIN1_oslash, "o/" },

499 - { "otilde", LATIN1_otilde, "o~" },

500 + { "ordf", LATIN1_ordf, "-a", 0x00aa},

501 + { "ordm", LATIN1_ordm, "-o", 0x00ba},

502 + { "oslash", LATIN1_oslash, "o/", 0x00f8},

503 + { "otilde", LATIN1_otilde, "o~", 0x00f5},

504 { "otimes", 0, "(x)" },

505 - { "ouml", LATIN1_ouml, "o\"" },

506 - { "para", LATIN1_para, "P:" },

507 - { "part", 0, "PART" },

508 - { "permil", 0, " 0/00" },

509 + { "ouml", LATIN1_ouml, "o\"", 0x00f6},

510 + { "para", LATIN1_para, "P:", 0x00b6},

511 + { "part", 0, "PART",0x2202},

512 + { "permil", 0, " 0/00",0x2030},

513 { "perp", 0, "-T" },

514 - { "phi", 0, "f" },

515 - { "pi", 0, "p" },

516 + { "phi", 0, "f", 0x03c6},

517 + { "pi", 0, "p", 0x03c0},

518 { "piv", 0, "Pi" },

519 - { "plusmn", LATIN1_plusmn, "+/-" },

520 - { "pound", LATIN1_pound, "-L-" },

521 + { "plusmn", LATIN1_plusmn, "+/-", 0x00b1},

522 + { "pound", LATIN1_pound, "-L-", 0x00a3},

523 { "prime", 0, "'" },

524 - { "prod", 0, "PROD" },

525 + { "prod", 0, "PROD",0x220f},

526 { "prop", 0, "0(" },

527 - { "psi", 0, "ps" },

528 + { "psi", 0, "ps", 0x03c8},

529 { "quot", 0, "\"" },

530 { "rArr", 0, "=>" },

531 - { "radic", 0, "SQRT" },

532 + { "radic", 0, "SQRT",0x221a},

533 { "rang", 0, "/>" },

534 { "raquo", LATIN1_raquo, ">>" },

535 - { "rarr", 0, "->" },

536 + { "rarr", 0, "->", 0x2192},

537 // { "rceil", 0, ">|" },

538 { "rdquo", 0, "\"" },

539 { "real", 0, "Re" },

540 - { "reg", LATIN1_reg, "(R)" },

541 + { "reg", LATIN1_reg, "(R)", 0x00ae},

542 // { "rfloor", 0, "|>" },

543 - { "rho", 0, "r" },

544 + { "rho", 0, "r", 0x03c1},

545 { "rsaquo", 0, ">" },

546 { "rsquo", 0, "'" },

547 { "sbquo", 0, "'" },

548 - { "scaron", 0, "s" },

549 + { "scaron", 0, "s", 0x0161},

550 { "sdot", 0, "DOT" },

551 - { "sect", LATIN1_sect, "S:" },

552 + { "sect", LATIN1_sect, "S:", 0x00a7},

553 { "shy", LATIN1_shy, "" },

554 - { "sigma", 0, "s" },

555 - { "sigmaf", 0, "s" },

556 + { "sigma", 0, "s", 0x03c3},

557 + { "sigmaf", 0, "s", 0x03c2},

558 { "sim", 0, "~" },

559 // { "spades", 0, "[spades]" },

560 { "sub", 0, "(C" },

561 { "sube", 0, "(_" },

562 - { "sum", 0, "SUM" },

563 + { "sum", 0, "SUM", 0x2211},

564 { "sup", 0, ")C" },

565 - { "sup1", LATIN1_sup1, "^1" },

566 - { "sup2", LATIN1_sup2, "^2" },

567 - { "sup3", LATIN1_sup3, "^3" },

568 + { "sup1", LATIN1_sup1, "^1", 0x00b9},

569 + { "sup2", LATIN1_sup2, "^2", 0x00b2},

570 + { "sup3", LATIN1_sup3, "^3", 0x00b3},

571 { "supe", 0, ")_" },

572 - { "szlig", LATIN1_szlig, "ss" },

573 - { "tau", 0, "t" },

574 + { "szlig", LATIN1_szlig, "ss", 0x00df},

575 + { "tau", 0, "t", 0x03c4},

576 { "there4", 0, ".:" },

577 - { "theta", 0, "th" },

578 - { "thorn", LATIN1_thorn, "th" },

579 - { "tilde", 0, "~" },

580 - { "times", LATIN1_times, "x" },

581 - { "trade", 0, "[TM]" },

582 + { "theta", 0, "th", 0x03b8},

583 + { "thorn", LATIN1_thorn, "th", 0x00fe},

584 + { "tilde", 0, "~", 0x02dc},

585 + { "times", LATIN1_times, "x", 0x00d7},

586 + { "trade", 0, "[TM]",0x2122},

587 { "uArr", 0, "^^" },

588 - { "uacute", LATIN1_uacute, "u'" },

589 + { "uacute", LATIN1_uacute, "u'", 0x00fa},

590 { "uarr", 0, "^" },

591 - { "ucirc", LATIN1_ucirc, "u^" },

592 - { "ugrave", LATIN1_ugrave, "u`" },

593 - { "uml", LATIN1_uml, "\"" },

594 - { "upsilon", 0, "u" },

595 - { "uuml", LATIN1_uuml, "u\"" },

596 + { "ucirc", LATIN1_ucirc, "u^", 0x00fb},

597 + { "ugrave", LATIN1_ugrave, "u`", 0x00f9},

598 + { "uml", LATIN1_uml, "\"", 0x00a8},

599 + { "upsilon", 0, "u", 0x03c5},

600 + { "uuml", LATIN1_uuml, "u\"", 0x00fc},

601 { "weierp", 0, "P" },

602 - { "xi", 0, "x" },

603 - { "yacute", LATIN1_yacute, "y'" },

604 - { "yen", LATIN1_yen, "YEN" },

605 - { "yuml", LATIN1_yuml, "y\"" },

606 - { "zeta", 0, "z" },

607 + { "xi", 0, "x", 0x03be},

608 + { "yacute", LATIN1_yacute, "y'", 0x00fd},

609 + { "yen", LATIN1_yen, "YEN", 0x00a5},

610 + { "yuml", LATIN1_yuml, "y\"", 0x00ff},

611 + { "zeta", 0, "z", 0x03b6},

612 };

613

614 -extern int use_iso8859;

615 +extern int use_encoding;

616

617 /* ------------------------------------------------------------------------- */

618

619 +char ubuf[5]; /* longest UTF-8 sequence (4 bytes) plus NUL */

620 +

621 +/* Encode code point x as a NUL-terminated UTF-8 string in the static

622 + * buffer ubuf; the result is overwritten by the next call. */

623 +char *mkutf(unsigned long x)

624 +{

625 +  static const unsigned char lead[] = { 0x00, 0xc0, 0xe0, 0xf0 }; /* lead-byte tag, indexed by continuation count */

626 +  int n = (x < 0x80) ? 0 : (x < 0x800) ? 1 : (x < 0x10000) ? 2 : 3; /* continuation bytes needed */

627 +  memset(ubuf, 0, sizeof(ubuf));

628 +  for (int i = n; i > 0; i--) { /* fill continuation bytes, lowest 6 bits last */

629 +    ubuf[i] = (char)(0x80 | (x & 0x3f));

630 +    x >>= 6;

631 +  }

632 +  /* Mask the remaining bits so the lead byte stays well-formed even for x > 0x1FFFFF. */

633 +  ubuf[0] = (char)(lead[n] | (x & (0x7f >> (n ? n + 1 : 0))));

634 +  return ubuf;

635 +}

636 +

637 void

638 replace_sgml_entities(string *s)

639 {

640 @@ -330,9 +349,9 @@

641 */

642 while (j < l && s->at(j) != '&') ++j;

643 /*

644 - * We could convert high-bit chars to "&#233;" here if use_iso8859

645 - * is off, then let them be translated or not. Is the purpose of

646 - * !use_iso8859 to allow SGML entities to be seen, or to strongly

647 + * We could convert high-bit chars to "&#233;" here if USE_ASCII

648 + * is on, then let them be translated or not. Is the purpose of

649 + * USE_ASCII to allow SGML entities to be seen, or to strongly

650 * filter against high-ASCII chars that might blow up a terminal

651 * that doesn't speak ISO8859? For the moment, "allow SGML entities

652 * to be seen" -- no filtering here.

653 @@ -370,7 +389,11 @@

654 if (!isdigit(c)) break;

655 x = 10 * x + c - '0';

656 }

657 - if (use_iso8859 || (x < 128)) {

658 + if (USE_UTF8) {

659 + s->replace(beg, j - beg, mkutf(x));

660 + j = beg + 1;

661 + }

662 + else if (USE_ISO8859 && (x < 256) || USE_ASCII && (x < 128)) {

663 s->replace(beg, j - beg, 1, (char) x);

664 j = beg + 1;

665 } else {

666 @@ -408,13 +431,17 @@

667 (int (*)(const void *, const void *)) strcmp

668 );

669 if (entity != NULL) {

670 - if (use_iso8859 && entity->iso8859code) {

671 + if (USE_ISO8859 && entity->iso8859code) {

672 s->replace(beg, j - beg, 1, (char) entity->iso8859code);

673 j = beg + 1;

674 - } else if (entity->asciistr) {

675 + } else if (!(USE_UTF8 && entity->unicode) && entity->asciistr) { /* ASCII text is the fallback in every mode unless a UTF-8 translation exists */

676 s->replace(beg, j - beg, entity->asciistr);

677 j = beg + 1;

678 } /* else don't replace it at all, we don't have a translation */

679 + else if(USE_UTF8 && entity->unicode) {

680 + s->replace(beg, j - beg, mkutf(entity->unicode));

681 + j = beg + 1;

682 + }

683 }

684 } else {

685 ; /* EXTENSION: Allow literal '&' sometimes. */

686 diff -r -u -bB html2text-1.3.2a/table.C html2text-1.3.2a-patched/table.C

687 --- html2text-1.3.2a/table.C 2002-07-22 13:32:50.000000000 +0200

688 +++ html2text-1.3.2a-patched/table.C 2005-05-13 22:19:59.871136320 +0200

689 @@ -175,7 +175,7 @@

690 - (*number_of_columns_return - 1) * (column_spacing + 0),

691 Area::LEFT // Yields better results than "p->halign"!

692 ));

693 - p->width = tmp.get() ? tmp->width() : 0;

694 + p->width = tmp.get() ? tmp->utf_width() : 0;

695 }

696 p->minimized = false;

697

698 @@ -308,7 +308,7 @@

699 left_of_column + old_column_width - 1,

700 Area::LEFT // Yields better results than "lc.halign"!

701 ));

702 - w = tmp->width();

703 + w = tmp->utf_width();

704 if (w >= left_of_column + old_column_width) lc.minimized = true;

705 }

706 if (w > left_of_column + new_column_width) {

SliTaz Repositories

wok-current view html2text/stuff/patch-utf8-html2text-1.3.2a.diff @ rev 23250