/* Writing Qt .qm files.
   Copyright (C) 2003, 2005-2007, 2009 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2003.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

/* Specification.  */
#include "write-qt.h"

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "error.h"
#include "xerror.h"
#include "message.h"
#include "po-charset.h"
#include "msgl-iconv.h"
#include "hash-string.h"
#include "unistr.h"
#include "xalloc.h"
#include "obstack.h"
#include "hash.h"
#include "binary-io.h"
#include "fwriteerror.h"
#include "gettext.h"

#define _(str) gettext (str)

/* Qt .qm files are read by the QTranslator::load() function and written
   by the Qt QTranslator::save() function.

   The Qt tool 'msg2qm' uses the latter function and can convert PO files
   to .qm files.  But since 'msg2qm' is marked as an "old" tool in Qt 3.0.5's
   i18n.html documentation and therefore likely to disappear, we provide the
   same functionality here.

   The format of .qm files, as reverse engineered from the functions
     QTranslator::save(const QString& filename, SaveMode mode)
     QTranslator::squeeze(SaveMode mode)
     QTranslatorMessage::write(QDataStream& stream, bool strip, Prefix prefix)
     elfHash(const char* name)
   in qt-3.0.5, is as follows:

     It's a binary data format.  Elements are u8 (byte), u16, u32.  They are
     written in big-endian order.
     The file starts with a magic string of 16 bytes:
       3C B8 64 18 CA EF 9C 95 CD 21 1C BF 60 A1 BD DD

     Then come three sections.  Each of the three sections is optional.  Each
     has this structure:
       struct {
         u8 section_type; // 0x42 = hashes, 0x69 = messages, 0x2f = contexts
         u32 length;      // number of bytes of the data
         u8 data[length];
       };

     In the first section, the hashes section, the data has the following
     structure:
       It's a sorted array of
         struct {
           u32 hashcode; // elfHash of the concatenation of msgid and
                         // disambiguating-comment
           u32 offset;   // offset within the data[] of the messages section
         };
       It's sorted in ascending order by hashcode as primary sorting criterion
       and - when the hashcodes are the same - by offset as secondary
       criterion.

     In the second section, the messages section, the data has the following
     structure:
       It's a sequence of records, each representing a message, in no
       particular order.  Each record is a sequence of subsections, each
       introduced by a particular subsection tag.  The possible subsection
       tags are (and they usually occur in this order):
         - 03: Translation.  Followed by the msgstr in UCS-2 or UTF-16 format:
               struct {
                 u32 length;
                 u16 chars[length/2];
               };
         - 08: Disambiguating-comment.  Followed by the NUL-terminated,
               ISO-8859-1 encoded, disambiguating-comment string:
               struct {
                 u32 length;    // number of bytes including the NUL at the end
                 u8 chars[length];
               };
         - 06: SourceText, i.e. msgid.  Followed by the NUL-terminated,
               ISO-8859-1 encoded, msgid:
               struct {
                 u32 length;    // number of bytes including the NUL at the end
                 u8 chars[length];
               };
         - 02: SourceText16, i.e. msgid.  Encoded as UCS-2, but must actually
               be ISO-8859-1.
               struct {
                 u32 length;
                 u16 chars[length/2];
               };
               This subsection tag is obsoleted by SourceText.
         - 07: Context.  Followed by the NUL-terminated, ISO-8859-1 encoded,
               context string (usually a C++ class name or empty):
               struct {
                 u32 length;    // number of bytes including the NUL at the end
                 u8 chars[length];
               };
         - 04: Context16.
               Encoded as UCS-2, but must actually be ISO-8859-1.
               struct {
                 u32 length;
                 u16 chars[length/2];
               };
               This subsection tag is obsoleted by Context.
         - 05: Hash.  Followed by
               struct {
                 u32 hashcode;  // elfHash of the concatenation of msgid and
                                // disambiguating-comment
               };
         - 01: End.  Designates the end of the record.  No further data.
       Usually the following subsections are written, but some of them are
       optional:
         - 03: Translation.
         - 08: Disambiguating-comment (optional).
         - 06: SourceText (optional).
         - 07: Context (optional).
         - 05: Hash.
         - 01: End.
       A subsection can be omitted if the value to be output is the same as
       for the previous record.

     The third section, the contexts section, contains the set of all
     occurring context strings.  This section is optional; it is used to speed
     up the search.  The data is a hash table with the following structure:
       struct {
         u16 table_size;
         u16 buckets[table_size];
         u8 pool[...];
       };
     pool[...] contains:
       u16 zero;
       for i = 0, ..., table_size-1:
         if there are context strings with elfHash(context)%table_size == i:
           for all context strings with elfHash(context)%table_size == i:
             len := min(length(context),255); // truncated to length 255
             struct {
               u8 len;
               u8 chars[len];
             };
           struct {
             u8 zero[1];         // signals the end of this bucket
             u8 padding[0 or 1]; // padding for even number of bytes
           };
     buckets[i] is 0 for an empty bucket, or the offset in pool[] where
     the context strings for this bucket start, divided by 2.
     This context section must not be used
       - if the empty context is used, or
       - if a context of length > 255 is used, or
       - if the context pool's size would be > 2^17.

     The elfHash function is the same as our hash_string function, except that
     at the end it maps a hash code of 0x00000000 to 0x00000001.

     When we convert from PO file format, all disambiguating-comments are
     empty.  A message without msgctxt gets the empty context, and in that
     case the contexts section is omitted.  */

/* Write a u8 (a single byte) to the output stream.
*/ static inline void write_u8 (FILE *output_file, unsigned char value) { putc (value, output_file); } /* Write a u16 (two bytes) to the output stream. */ static inline void write_u16 (FILE *output_file, unsigned short value) { unsigned char data[2]; data[0] = (value >> 8) & 0xff; data[1] = value & 0xff; fwrite (data, 2, 1, output_file); } /* Write a u32 (four bytes) to the output stream. */ static inline void write_u32 (FILE *output_file, unsigned int value) { unsigned char data[4]; data[0] = (value >> 24) & 0xff; data[1] = (value >> 16) & 0xff; data[2] = (value >> 8) & 0xff; data[3] = value & 0xff; fwrite (data, 4, 1, output_file); } #define obstack_chunk_alloc xmalloc #define obstack_chunk_free free /* Add a u8 (a single byte) to an obstack. */ static void append_u8 (struct obstack *mempool, unsigned char value) { unsigned char data[1]; data[0] = value; obstack_grow (mempool, data, 1); } /* Add a u16 (two bytes) to an obstack. */ static void append_u16 (struct obstack *mempool, unsigned short value) { unsigned char data[2]; data[0] = (value >> 8) & 0xff; data[1] = value & 0xff; obstack_grow (mempool, data, 2); } /* Add a u32 (four bytes) to an obstack. */ static void append_u32 (struct obstack *mempool, unsigned int value) { unsigned char data[4]; data[0] = (value >> 24) & 0xff; data[1] = (value >> 16) & 0xff; data[2] = (value >> 8) & 0xff; data[3] = value & 0xff; obstack_grow (mempool, data, 4); } /* Add an ISO-8859-1 encoded string to an obstack. */ static void append_base_string (struct obstack *mempool, const char *string) { size_t length = strlen (string) + 1; append_u32 (mempool, length); obstack_grow (mempool, string, length); } /* Add an UTF-16 encoded string to an obstack. */ static void append_unicode_string (struct obstack *mempool, const unsigned short *string, size_t length) { append_u32 (mempool, length * 2); for (; length > 0; string++, length--) append_u16 (mempool, *string); } /* Retrieve a 4-byte integer from memory. 
*/ static inline unsigned int peek_u32 (const unsigned char *p) { return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; } /* Convert an UTF-8 string to ISO-8859-1, without error checking. */ static char * conv_to_iso_8859_1 (const char *string) { size_t length = strlen (string); const char *str = string; const char *str_limit = string + length; /* Conversion to ISO-8859-1 can only reduce the number of bytes. */ char *result = XNMALLOC (length + 1, char); char *q = result; while (str < str_limit) { ucs4_t uc; str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str); /* It has already been verified that the string fits in ISO-8859-1. */ if (!(uc < 0x100)) abort (); /* Store as ISO-8859-1. */ *q++ = (unsigned char) uc; } *q = '\0'; assert (q - result <= length); return result; } /* Convert an UTF-8 string to UTF-16, returning its size (number of UTF-16 codepoints) in *SIZEP. */ static unsigned short * conv_to_utf16 (const char *string, size_t *sizep) { size_t length = strlen (string); const char *str = string; const char *str_limit = string + length; /* Conversion to UTF-16 can at most double the number of bytes. */ unsigned short *result = XNMALLOC (length, unsigned short); unsigned short *q = result; while (str < str_limit) { ucs4_t uc; str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str); if (uc < 0x10000) /* UCS-2 character. */ *q++ = (unsigned short) uc; else { /* UTF-16 surrogate. */ *q++ = 0xd800 + ((uc - 0x10000) >> 10); *q++ = 0xdc00 + ((uc - 0x10000) & 0x3ff); } } assert (q - result <= 2 * length); *sizep = q - result; return result; } /* Return the Qt hash code of a string. */ static unsigned int string_hashcode (const char *str) { unsigned int h; h = hash_string (str); if (h == 0) h = 1; return h; } /* Compare two entries of the hashes section. 
*/
/* VA and VB each point to an 8-byte entry: a big-endian u32 hash code
   followed by a big-endian u32 offset.  Entries are ordered by hash code
   first and by offset second.  Suitable as a qsort comparison function.  */
static int
cmp_hashes (const void *va, const void *vb)
{
  const unsigned char *a = (const unsigned char *) va;
  const unsigned char *b = (const unsigned char *) vb;
  unsigned int a_hashcode = peek_u32 (a);
  unsigned int b_hashcode = peek_u32 (b);

  if (a_hashcode != b_hashcode)
    return (a_hashcode >= b_hashcode ? 1 : -1);
  else
    {
      unsigned int a_offset = peek_u32 (a + 4);
      unsigned int b_offset = peek_u32 (b + 4);

      if (a_offset != b_offset)
        return (a_offset >= b_offset ? 1 : -1);
      else
        return 0;
    }
}

/* Write a section to the output stream.
   A section consists of the u8 TAG, the u32 SIZE, and SIZE bytes of DATA.
   Nothing at all is written when SIZE is 0.  */
static void
write_section (FILE *output_file, unsigned char tag, void *data, size_t size)
{
  /* A section can be omitted if it is empty.  */
  if (size > 0)
    {
      write_u8 (output_file, tag);
      write_u32 (output_file, size);
      fwrite (data, size, 1, output_file);
    }
}

/* Write an entire .qm file.
   OUTPUT_FILE is the stream to write to.  MLP is the message list; its
   strings are decoded as UTF-8, and msgid and msgctxt must fit in ISO-8859-1
   (conv_to_iso_8859_1 aborts otherwise).
   Write errors are not reported here; the caller checks the stream
   afterwards.  */
static void
write_qm (FILE *output_file, message_list_ty *mlp)
{
  static const unsigned char magic[16] =
    {
      0x3C, 0xB8, 0x64, 0x18, 0xCA, 0xEF, 0x9C, 0x95,
      0xCD, 0x21, 0x1C, 0xBF, 0x60, 0xA1, 0xBD, 0xDD
    };
  struct obstack hashes_pool;
  struct obstack messages_pool;
  size_t j;

  obstack_init (&hashes_pool);
  obstack_init (&messages_pool);

  /* Prepare the hashes section and the messages section.  */
  for (j = 0; j < mlp->nitems; j++)
    {
      message_ty *mp = mlp->item[j];

      /* No need to emit the header entry, it's not needed at runtime.  */
      if (!is_header (mp))
        {
          char *msgctxt_as_iso_8859_1 =
            conv_to_iso_8859_1 (mp->msgctxt != NULL ? mp->msgctxt : "");
          char *msgid_as_iso_8859_1 = conv_to_iso_8859_1 (mp->msgid);
          size_t msgstr_len;
          unsigned short *msgstr_as_utf16 =
            conv_to_utf16 (mp->msgstr, &msgstr_len);
          unsigned int hashcode = string_hashcode (msgid_as_iso_8859_1);
          /* The record about to be added starts at the current end of the
             messages section.  */
          unsigned int offset = obstack_object_size (&messages_pool);

          /* Add a record to the hashes section.  */
          append_u32 (&hashes_pool, hashcode);
          append_u32 (&hashes_pool, offset);

          /* Add a record to the messages section.  */

          /* 03: Translation.  */
          append_u8 (&messages_pool, 0x03);
          append_unicode_string (&messages_pool, msgstr_as_utf16, msgstr_len);

          /* 08: Disambiguating-comment, always empty here.  */
          append_u8 (&messages_pool, 0x08);
          append_base_string (&messages_pool, "");

          /* 06: SourceText.  */
          append_u8 (&messages_pool, 0x06);
          append_base_string (&messages_pool, msgid_as_iso_8859_1);

          /* 07: Context.  */
          append_u8 (&messages_pool, 0x07);
          append_base_string (&messages_pool, msgctxt_as_iso_8859_1);

          /* 05: Hash.  */
          append_u8 (&messages_pool, 0x05);
          append_u32 (&messages_pool, hashcode);

          /* 01: End.  */
          append_u8 (&messages_pool, 0x01);

          free (msgstr_as_utf16);
          free (msgid_as_iso_8859_1);
          free (msgctxt_as_iso_8859_1);
        }
    }

  /* Sort the hashes section.  */
  {
    size_t nstrings = obstack_object_size (&hashes_pool) / 8;

    if (nstrings > 0)
      qsort (obstack_base (&hashes_pool), nstrings, 8, cmp_hashes);
  }

  /* Write the magic number.  */
  fwrite (magic, sizeof (magic), 1, output_file);

  /* Write the hashes section.  */
  write_section (output_file, 0x42,
                 obstack_base (&hashes_pool),
                 obstack_object_size (&hashes_pool));

  /* Write the messages section.  */
  write_section (output_file, 0x69,
                 obstack_base (&messages_pool),
                 obstack_object_size (&messages_pool));

  /* Decide whether to write a contexts section.
     It is written only if every non-header message has a non-empty msgctxt
     of at most 255 bytes.
     NOTE(review): this section uses the UTF-8 msgctxt bytes, whereas the
     messages section above stores the ISO-8859-1 conversion.  The two differ
     for non-ASCII contexts - confirm which form the reader expects.  */
  {
    bool can_write_contexts = true;

    for (j = 0; j < mlp->nitems; j++)
      {
        message_ty *mp = mlp->item[j];

        if (!is_header (mp))
          if (mp->msgctxt == NULL || mp->msgctxt[0] == '\0'
              || strlen (mp->msgctxt) > 255)
            {
              can_write_contexts = false;
              break;
            }
      }

    if (can_write_contexts)
      {
        hash_table all_contexts;
        size_t num_contexts;
        unsigned long table_size;

        /* Collect the contexts, removing duplicates.  */
        hash_init (&all_contexts, 10);
        for (j = 0; j < mlp->nitems; j++)
          {
            message_ty *mp = mlp->item[j];

            if (!is_header (mp))
              hash_insert_entry (&all_contexts,
                                 mp->msgctxt, strlen (mp->msgctxt) + 1,
                                 NULL);
          }

        /* Compute the number of different contexts.  */
        num_contexts = all_contexts.size;

        /* Compute a suitable hash table size.  It must fit in a u16.  */
        table_size = next_prime (num_contexts * 1.7);
        if (table_size >= 0x10000)
          table_size = 65521;

        /* Put the contexts into a hash table of size table_size.  */
        {
          struct list_cell { const char *context; struct list_cell *next; };
          /* One list cell is consumed per context.  Because table_size is
             capped above, it can be smaller than the number of contexts, so
             size the cell array by whichever of the two is larger.  */
          size_t num_cells =
            (num_contexts > table_size ? num_contexts : table_size);
          struct list_cell *list_memory =
            XNMALLOC (num_cells, struct list_cell);
          struct list_cell *freelist;
          struct bucket { struct list_cell *head; struct list_cell **tail; };
          struct bucket *buckets = XNMALLOC (table_size, struct bucket);
          size_t i;

          freelist = list_memory;

          for (i = 0; i < table_size; i++)
            {
              buckets[i].head = NULL;
              buckets[i].tail = &buckets[i].head;
            }

          /* Append each context to the list of its bucket.  */
          {
            void *iter;
            const void *key;
            size_t keylen;
            void *null;

            iter = NULL;
            while (hash_iterate (&all_contexts, &iter, &key, &keylen, &null)
                   == 0)
              {
                const char *context = (const char *) key;

                i = string_hashcode (context) % table_size;
                freelist->context = context;
                freelist->next = NULL;
                *buckets[i].tail = freelist;
                buckets[i].tail = &freelist->next;
                freelist++;
              }
          }

          /* Determine the total context pool size.  */
          {
            size_t pool_size;

            /* The pool starts with a u16 zero.  */
            pool_size = 2;
            for (i = 0; i < table_size; i++)
              if (buckets[i].head != NULL)
                {
                  const struct list_cell *p;

                  /* Length byte plus characters for each context.  */
                  for (p = buckets[i].head; p != NULL; p = p->next)
                    pool_size += 1 + strlen (p->context);
                  /* Zero byte that ends the bucket.  */
                  pool_size++;
                  /* Padding to an even number of bytes.  */
                  if ((pool_size % 2) != 0)
                    pool_size++;
                }
            if (pool_size <= 0x20000)
              {
                /* Prepare the contexts section.  */
                struct obstack contexts_pool;
                size_t pool_offset;

                obstack_init (&contexts_pool);

                /* First the table size and the bucket offsets.  */
                append_u16 (&contexts_pool, table_size);
                pool_offset = 2;
                for (i = 0; i < table_size; i++)
                  if (buckets[i].head != NULL)
                    {
                      const struct list_cell *p;

                      append_u16 (&contexts_pool, pool_offset / 2);
                      for (p = buckets[i].head; p != NULL; p = p->next)
                        pool_offset += 1 + strlen (p->context);
                      pool_offset++;
                      if ((pool_offset % 2) != 0)
                        pool_offset++;
                    }
                  else
                    append_u16 (&contexts_pool, 0);
                if (!(pool_offset == pool_size))
                  abort ();

                /* Then the pool itself.  */
                append_u16 (&contexts_pool, 0);
                pool_offset = 2;
                for (i = 0; i < table_size; i++)
                  if (buckets[i].head != NULL)
                    {
                      const struct list_cell *p;

                      for (p = buckets[i].head; p != NULL; p = p->next)
                        {
                          append_u8 (&contexts_pool, strlen (p->context));
                          obstack_grow (&contexts_pool,
                                        p->context, strlen (p->context));
                          pool_offset += 1 + strlen (p->context);
                        }
                      append_u8 (&contexts_pool, 0);
                      pool_offset++;
                      if ((pool_offset % 2) != 0)
                        {
                          append_u8 (&contexts_pool, 0);
                          pool_offset++;
                        }
                    }
                if (!(pool_offset == pool_size))
                  abort ();

                if (!(obstack_object_size (&contexts_pool)
                      == 2 + 2 * table_size + pool_size))
                  abort ();

                /* Write the contexts section.  */
                write_section (output_file, 0x2f,
                               obstack_base (&contexts_pool),
                               obstack_object_size (&contexts_pool));

                obstack_free (&contexts_pool, NULL);
              }
          }

          free (buckets);
          free (list_memory);
        }

        hash_destroy (&all_contexts);
      }
  }

  obstack_free (&messages_pool, NULL);
  obstack_free (&hashes_pool, NULL);
}

/* Write the messages of MLP as a Qt .qm file.
   MLP is the message list; it is passed through iconv_message_list with
   CANON_ENCODING and po_charset_utf8 before writing.
   DOMAIN_NAME selects the destination: if it is "-", standard output is
   used, otherwise the file FILE_NAME is created.
   Nothing is written, not even an empty file, if MLP has no items.
   Return 0 if the file was written or MLP was empty.  Return 1, after
   reporting an error, if MLP has plural forms, if a msgctxt or msgid has
   characters outside ISO-8859-1, or if the output file cannot be opened.
   A failure while writing is reported through error (EXIT_FAILURE, ...).  */
int
msgdomain_write_qt (message_list_ty *mlp, const char *canon_encoding,
                    const char *domain_name, const char *file_name)
{
  FILE *output_file;

  /* If no entry for this domain don't even create the file.  */
  if (mlp->nitems != 0)
    {
      /* Determine whether mlp has plural entries.  */
      {
        bool has_plural;
        size_t j;

        has_plural = false;
        for (j = 0; j < mlp->nitems; j++)
          if (mlp->item[j]->msgid_plural != NULL)
            {
              has_plural = true;
              break;
            }
        if (has_plural)
          {
            multiline_error (xstrdup (""),
                             xstrdup (_("\
message catalog has plural form translations\n\
but the Qt message catalog format doesn't support plural handling\n")));
            return 1;
          }
      }

      /* Convert the messages to Unicode.  */
      iconv_message_list (mlp, canon_encoding, po_charset_utf8, NULL);

      /* Determine whether mlp has non-ISO-8859-1 msgctxt entries.  */
      {
        size_t j;

        for (j = 0; j < mlp->nitems; j++)
          {
            const char *string = mlp->item[j]->msgctxt;

            if (string != NULL)
              {
                /* An UTF-8 encoded string fits in ISO-8859-1 if and only if
                   all its bytes are < 0xc4.  */
                for (; *string; string++)
                  if ((unsigned char) *string >= 0xc4)
                    {
                      multiline_error (xstrdup (""),
                                       xstrdup (_("\
message catalog has msgctxt strings containing characters outside ISO-8859-1\n\
but the Qt message catalog format supports Unicode only in the translated\n\
strings, not in the context strings\n")));
                      return 1;
                    }
              }
          }
      }

      /* Determine whether mlp has non-ISO-8859-1 msgid entries.  */
      {
        size_t j;

        for (j = 0; j < mlp->nitems; j++)
          {
            const char *string = mlp->item[j]->msgid;

            /* An UTF-8 encoded string fits in ISO-8859-1 if and only if all
               its bytes are < 0xc4.  */
            for (; *string; string++)
              if ((unsigned char) *string >= 0xc4)
                {
                  multiline_error (xstrdup (""),
                                   xstrdup (_("\
message catalog has msgid strings containing characters outside ISO-8859-1\n\
but the Qt message catalog format supports Unicode only in the translated\n\
strings, not in the untranslated strings\n")));
                  return 1;
                }
          }
      }

      if (strcmp (domain_name, "-") == 0)
        {
          output_file = stdout;
          SET_BINARY (fileno (output_file));
        }
      else
        {
          output_file = fopen (file_name, "wb");
          if (output_file == NULL)
            {
              error (0, errno, _("error while opening \"%s\" for writing"),
                     file_name);
              return 1;
            }
        }

      if (output_file != NULL)
        {
          write_qm (output_file, mlp);

          /* Make sure nothing went wrong.  */
          if (fwriteerror (output_file))
            error (EXIT_FAILURE, errno, _("error while writing \"%s\" file"),
                   file_name);
        }
    }

  return 0;
}