You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

cet.cc 11KB


  1. /*
  2. Character encoding transformation - basics
  3. Copyright (C) 2005-2008 Olaf Klein, o.b.klein@gpsbabel.org
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111 USA
  15. */
  16. #include "defs.h"
  17. #include "cet.h"
  18. /* ! ALL vec PARAMETERS HAVE TO BE A VALID POINTER TO A cet_cs_vec_t RECORD ! */
  19. /* =========================================================================== */
  20. /* %%% single character or value transmission %%% */
  21. /* --------------------------------------------------------------------------- */
  22. /* %%% cet_char_to_ucs4 %%%
  23. *
  24. * single character to UCS-4 code %%%
  25. * return values: 0 if convertable character, otherwise 1
  26. */
  27. int
  28. cet_char_to_ucs4(const char src, const cet_cs_vec_t* vec, int* value)
  29. {
  30. int trash, c;
  31. int* dest;
  32. c = ((unsigned char)src & 0xFF);
  33. dest = (value != NULL) ? value : &trash;
  34. *dest = c;
  35. c -= vec->ucs4_offset;
  36. if (c < 0) {
  37. return CET_SUCCESS;
  38. } else if ((c >= vec->ucs4_count) || (vec->ucs4_map[c] == -1)) {
  39. return CET_ERROR;
  40. } else {
  41. *dest = vec->ucs4_map[c];
  42. return CET_SUCCESS;
  43. }
  44. }
  /* %%% cet_ucs4_to_utf8 %%%
   *
   * Encode a single UCS-4 value as a UTF-8 sequence of 1 to 6 bytes.
   * No terminating NUL is written.
   *
   * dest:      target buffer; may be NULL, in which case the sequence is
   *            written to an internal scratch buffer and only the length
   *            is of use to the caller
   * dest_size: number of bytes available in the target
   * value:     UCS-4 value to encode
   *
   * return values: > 0: length of the produced UTF-8 sequence
   *                  0: value has the top bit set (e.g. -1), nothing produced
   *                < 0: -(number of additional bytes needed in the target)
   */
  int
  cet_ucs4_to_utf8(char* dest, size_t dest_size, int value)
  {
    /* marker bits of the first byte, indexed by total sequence length */
    static const unsigned char lead_marker[7] = {
      0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
    };
    unsigned char scratch[16];
    unsigned char* out = (dest != NULL) ? (unsigned char*) dest : scratch;
    size_t need;
    int shift;

    /* how many bytes does this value occupy? */
    if ((value & 0xffffff80) == 0) {          /* <= 7 bits */
      need = 1;
    } else if ((value & 0xfffff800) == 0) {   /* <= 11 bits */
      need = 2;
    } else if ((value & 0xffff0000) == 0) {   /* <= 16 bits */
      need = 3;
    } else if ((value & 0xffe00000) == 0) {   /* <= 21 bits */
      need = 4;
    } else if ((value & 0xfc000000) == 0) {   /* <= 26 bits */
      need = 5;
    } else if ((value & 0x80000000) == 0) {   /* <= 31 bits */
      need = 6;
    } else {
      return 0;                               /* Value = -1 */
    }

    if (dest_size < need) {
      return (dest_size - need);              /* negative: bytes missing */
    }

    /* first byte: marker plus the most significant payload bits */
    shift = 6 * ((int)need - 1);
    *out++ = (lead_marker[need] | (value >> shift));

    /* continuation bytes: six payload bits each, high to low */
    while (shift > 0) {
      shift -= 6;
      *out++ = (0x80 | ((value >> shift) & 0x3f));
    }

    return (int)need;
  }
  115. /* %%% cet_utf8_to_ucs4 %%%
  116. *
  117. * decode single UTF-8 sequence into UCS-4 value
  118. *
  119. * return values: 0 if success, otherwise 1
  120. */
  121. int
  122. cet_utf8_to_ucs4(const char* str, int* bytes, int* value)
  123. {
  124. unsigned char* cp = (unsigned char*)str;
  125. if (*cp < 0x80) {
  126. if (bytes != NULL) {
  127. *bytes = 1;
  128. }
  129. if (value != NULL) {
  130. *value = *cp;
  131. }
  132. return CET_SUCCESS;
  133. } else {
  134. unsigned char bits = 0xc0;
  135. unsigned char mask = 0xe0;
  136. int len = 0;
  137. for (len = 1; len <= 6; len++) { /* outer loop, test UTF-8 frame */
  138. if ((*cp & mask) == bits) {
  139. int i = len;
  140. while (i-- > 0) {
  141. cp++;
  142. if ((*cp & 0xc0) != 0x80) {
  143. break; /* invalid */
  144. } else if (i == 0) { /* all valid */
  145. char* c = (char*)str; /* found valid sequence, now storing value */
  146. int res = *c++ & (mask ^ 0xFF);
  147. i = len;
  148. while (i-- > 0) {
  149. res = (res << 6) | (*c++ & 0x3f);
  150. }
  151. if (bytes != NULL) {
  152. *bytes = len + 1;
  153. }
  154. if (value != NULL) {
  155. *value = res;
  156. }
  157. return CET_SUCCESS;
  158. }
  159. }
  160. }
  161. bits = (bits >> 1) | 0x80;
  162. mask = (mask >> 1) | 0x80;
  163. }
  164. }
  165. if (bytes != NULL) {
  166. *bytes = 1;
  167. }
  168. if (value != NULL) {
  169. *value = *cp;
  170. }
  171. return CET_ERROR; /* not valid */
  172. }
  173. /* %%% cet_ucs4_to_char %%%
  174. *
  175. * convert single UCS-4 value to original character from CS
  176. *
  177. * return values: coverted character or "CET_NOT_CONVERTABLE_DEFAULT"
  178. * if not possible
  179. */
  180. short
  181. cet_ucs4_to_char(const int value, const cet_cs_vec_t* vec)
  182. {
  183. cet_ucs4_link_t* link;
  184. if ((link = (cet_ucs4_link_t*)vec->ucs4_link)) {
  185. int i = 0;
  186. int j = vec->ucs4_links - 1; /* validate ucs value against vec */
  187. while (i <= j) {
  188. int a = (i + j) >> 1;
  189. int x = link[a].value;
  190. if (x < value) {
  191. i = a + 1;
  192. } else if (x > value) {
  193. j = a - 1;
  194. } else {
  195. return link[a].origin;
  196. }
  197. }
  198. }
  199. if ((link = (cet_ucs4_link_t*)vec->ucs4_extra)) { /* can be NULL */
  200. int i = 0;
  201. int j = vec->ucs4_extras - 1;
  202. while (i <= j) {
  203. int a = (i + j) >> 1;
  204. int x = link[a].value;
  205. if (x < value) {
  206. i = a + 1;
  207. } else if (x > value) {
  208. j = a - 1;
  209. } else {
  210. return link[a].origin;
  211. }
  212. }
  213. }
  214. if (value < vec->ucs4_offset + vec->ucs4_count) {
  215. return (char)value & 0xFF;
  216. } else {
  217. if (vec->fallback && (vec->fallback != vec)) {
  218. return cet_ucs4_to_char(value, vec->fallback);
  219. } else {
  220. return CET_NOT_CONVERTABLE_DEFAULT;
  221. }
  222. }
  223. }
  224. /* %%% cet_utf8_to_char %%%
  225. *
  226. * Convert single UTF-8 sequence directly into associated characted
  227. * by given character set.
  228. */
  229. short
  230. cet_utf8_to_char(const char* str, const cet_cs_vec_t* vec, /* out */ int* bytes, int* value)
  231. {
  232. int b, v;
  233. cet_utf8_to_ucs4(str, &b, &v); /* decode UTF-8 sequence */
  234. if (bytes != NULL) {
  235. *bytes = b;
  236. }
  237. if (value != NULL) {
  238. *value = v;
  239. }
  240. return cet_ucs4_to_char(v, vec);
  241. }
  242. /* =========================================================================== */
  243. /* %%% UTF-8 string manipulation functions %%% */
  244. /* =========================================================================== */
  245. /* %%% cet_utf8_strlen %%%
  246. *
  247. * Returns the number of valid (visible) characters.
  248. */
  249. unsigned int
  250. cet_utf8_strlen(const char* str)
  251. {
  252. if (str) {
  253. const char* cin = str;
  254. int len = 0;
  255. while (*cin) {
  256. int bytes, value;
  257. if (CET_SUCCESS == cet_utf8_to_ucs4(cin, &bytes, &value)) {
  258. len++;
  259. }
  260. cin += bytes;
  261. }
  262. return len;
  263. } else {
  264. return 0;
  265. }
  266. }
  /* %%% cet_utf8_strdup %%%
   *
   * Checks and duplicates a UTF-8 string. The work is done by
   * cet_utf8_strndup, with the byte length of str as character limit so
   * that the whole string is processed.
   *
   * return values: newly allocated copy, or NULL if str is NULL
   */
  char*
  cet_utf8_strdup(const char* str)
  {
    return (str != NULL) ? cet_utf8_strndup(str, strlen(str)) : NULL;
  }
  280. /* %%% cet_utf8_strndup %%%
  281. *
  282. * Checks and duplicates an UTF-8 string
  283. */
  284. char*
  285. cet_utf8_strndup(const char* str, const int maxlen)
  286. {
  287. if (str) {
  288. const char* cin = str;
  289. char* res, *cout;
  290. int len = 0;
  291. res = cout = xstrdup(cin);
  292. while (*cin && (len < maxlen)) {
  293. int bytes, value;
  294. if (CET_SUCCESS == cet_utf8_to_ucs4(cin, &bytes, &value)) {
  295. cout += cet_ucs4_to_utf8(cout, 6, value);
  296. len += 1;
  297. }
  298. cin += bytes;
  299. }
  300. *cout = '\0';
  301. if ((cin - str) != (cout - res)) {
  302. cout = xstrdup(res);
  303. xfree(res);
  304. res = cout;
  305. }
  306. return res;
  307. } else {
  308. return NULL;
  309. }
  310. }
  311. /* =========================================================================== */
  312. /* %%% full string transformation %%% */
  313. /* =========================================================================== */
  314. /* %%% cet_str_utf8_to_any %%%
  315. *
  316. * Converts a UTF-8 string to given character set
  317. */
  318. char*
  319. cet_str_utf8_to_any(const char* src, const cet_cs_vec_t* vec)
  320. {
  321. char* c = (char*)src;
  322. int len;
  323. char* res, *dest, *cend;
  324. if (c == NULL) {
  325. return NULL;
  326. }
  327. if (vec->ucs4_count == 0) {
  328. return xstrdup(src); /* UTF-8 -> UTF-8 */
  329. }
  330. len = strlen(c);
  331. res = dest = (char*) xmalloc(len + 1); /* target will become smaller or equal length */
  332. cend = c + len;
  333. while (c < cend) {
  334. int bytes;
  335. *dest++ = cet_utf8_to_char(c, vec, &bytes, NULL);
  336. c += bytes;
  337. }
  338. *dest = '\0';
  339. return res;
  340. }
  341. /* %%% cet_str_any_to_utf8 %%%
  342. *
  343. * Converts a string from given character set to UTF-8
  344. */
  345. char*
  346. cet_str_any_to_utf8(const char* src, const cet_cs_vec_t* vec)
  347. {
  348. int len, value;
  349. char* result, *cin, *cout;
  350. char temp = CET_NOT_CONVERTABLE_DEFAULT;
  351. cin = (char*)src;
  352. if (cin == NULL) {
  353. return NULL;
  354. }
  355. if (vec->ucs4_count == 0) {
  356. return xstrdup(src); /* UTF-8 -> UTF-8 */
  357. }
  358. len = 0;
  359. while (*cin != '\0') { /* determine length of resulting UTF-8 string */
  360. if (CET_ERROR == cet_char_to_ucs4(*cin++, vec, &value)) {
  361. cet_char_to_ucs4(temp, vec, &value);
  362. }
  363. len += cet_ucs4_to_utf8(NULL, 6, value);
  364. }
  365. result = cout = (char*) xmalloc(len + 1);
  366. cin = (char*)src;
  367. while (*cin != '\0') {
  368. if (CET_ERROR == cet_char_to_ucs4(*cin++, vec, &value)) {
  369. cet_char_to_ucs4(temp, vec, &value);
  370. }
  371. cout += cet_ucs4_to_utf8(cout, 6, value);
  372. }
  373. *cout = '\0';
  374. return result;
  375. }
  /* %%% cet_str_uni_to_utf8 %%%
   *
   * Converts a unicode string (16-bit units, read with le_read16) to UTF-8.
   * Every 16-bit unit is encoded on its own.
   *
   * src:    input units; need not be terminated
   * length: number of 16-bit units to convert
   *
   * return values: newly allocated, NUL-terminated UTF-8 string, or NULL
   *                if src is NULL
   */
  char*
  cet_str_uni_to_utf8(const short* src, const int length)
  {
    unsigned short* units;
    char* out;
    char* wr;
    int idx;
    int len = 0;

    if (src == NULL) {
      return NULL;
    }
    units = (unsigned short*)src;

    /* pass 1: add up the size of the UTF-8 result */
    for (idx = 0; idx < length; idx++) {
      len += cet_ucs4_to_utf8(NULL, 6, le_read16(units + idx));
    }

    out = wr = (char*) xmalloc(len + 1);

    /* pass 2: write the UTF-8 sequences */
    for (idx = 0; idx < length; idx++) {
      wr += cet_ucs4_to_utf8(wr, 6, le_read16(units + idx));
    }
    *wr = '\0';

    return out;
  }
  404. /* %%% cet_str_any_to_uni %%%
  405. *
  406. * Converts a string in given character set to a 'wide string' (unicode)
  407. */
  408. short*
  409. cet_str_any_to_uni(const char* src, const cet_cs_vec_t* vec, int* length)
  410. {
  411. char* utf8;
  412. int len;
  413. short* res, *sout;
  414. if (! src) {
  415. utf8 = xstrdup("");
  416. } else if (vec->ucs4_count == 0) {
  417. utf8 = cet_utf8_strdup(src); /* UTF-8 -> clean UTF-8 */
  418. } else {
  419. utf8 = cet_str_any_to_utf8(src, vec);
  420. }
  421. len = cet_utf8_strlen(utf8);
  422. res = sout = (short int*) xcalloc(2, len + 1);
  423. if (len) {
  424. char* cin = utf8;
  425. while (*cin) {
  426. int bytes, value;
  427. if (CET_SUCCESS == cet_utf8_to_ucs4(cin, &bytes, &value)) {
  428. le_write16(sout, value);
  429. sout++;
  430. }
  431. cin += bytes;
  432. }
  433. }
  434. *sout = 0;
  435. if (length) {
  436. *length = len;
  437. }
  438. xfree(utf8);
  439. return res;
  440. }