/* This file is part of the YAZ toolkit. * Copyright (C) Index Data * See the file LICENSE for details. */ /** * \file * \brief MARC-8 decoding * * MARC-8 reference: * http://www.loc.gov/marc/specifications/speccharmarc8.html */ #if HAVE_CONFIG_H #include #endif #include #include #include #include #include "iconv-p.h" struct decoder_data { int g0_mode; int g1_mode; int comb_offset; int comb_size; unsigned long comb_x[8]; size_t comb_no_read[8]; int control_mode; }; yaz_conv_func_t yaz_marc8_42_conv; yaz_conv_func_t yaz_marc8_45_conv; yaz_conv_func_t yaz_marc8_67_conv; yaz_conv_func_t yaz_marc8_62_conv; yaz_conv_func_t yaz_marc8_70_conv; yaz_conv_func_t yaz_marc8_32_conv; yaz_conv_func_t yaz_marc8_4E_conv; yaz_conv_func_t yaz_marc8_51_conv; yaz_conv_func_t yaz_marc8_33_conv; yaz_conv_func_t yaz_marc8_34_conv; yaz_conv_func_t yaz_marc8_53_conv; yaz_conv_func_t yaz_marc8_31_conv; static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, struct decoder_data *data, unsigned char *inp, size_t inbytesleft, size_t *no_read, int *comb); static unsigned long read_marc8(yaz_iconv_t cd, yaz_iconv_decoder_t d, unsigned char *inp, size_t inbytesleft, size_t *no_read) { struct decoder_data *data = (struct decoder_data *) d->data; unsigned long x; if (data->comb_offset < data->comb_size) { *no_read = data->comb_no_read[data->comb_offset]; x = data->comb_x[data->comb_offset]; /* special case for double-diacritic combining characters, INVERTED BREVE and DOUBLE TILDE. We'll increment the no_read counter by 1, since we want to skip over the processing of the closing ligature character */ /* this code is no longer necessary.. our handlers code in yaz_marc8_?_conv (generated by charconv.tcl) now returns 0 and no_read=1 when a sequence does not match the input. The SECOND HALFs in codetables.xml produces a non-existant entry in the conversion trie.. Hence when met, the input byte is skipped as it should (in yaz_iconv) */ #if 0 if (x == 0x0361 || x == 0x0360) *no_read += 1; #endif data->comb_offset++; return x; } data->comb_offset = 0; for (data->comb_size = 0; data->comb_size < 8; data->comb_size++) { int comb = 0; if (inbytesleft == 0 && data->comb_size) { yaz_iconv_set_errno(cd, YAZ_ICONV_EINVAL); x = 0; *no_read = 0; break; } x = yaz_read_marc8_comb(cd, data, inp, inbytesleft, no_read, &comb); if (!comb || !x) break; data->comb_x[data->comb_size] = x; data->comb_no_read[data->comb_size] = *no_read; inp += *no_read; inbytesleft = inbytesleft - *no_read; } return x; } static unsigned long read_marc8s(yaz_iconv_t cd, yaz_iconv_decoder_t d, unsigned char *inp, size_t inbytesleft, size_t *no_read) { struct decoder_data *data = (struct decoder_data *) d->data; unsigned long x = read_marc8(cd, d, inp, inbytesleft, no_read); if (x && data->comb_size == 1) { if (yaz_iso_8859_1_lookup_x12(x, data->comb_x[0], &x)) { *no_read += data->comb_no_read[0]; data->comb_size = 0; } } return x; } static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, struct decoder_data *data, unsigned char *inp, size_t inbytesleft, size_t *no_read, int *comb) { *no_read = 0; while (inbytesleft > 0 && *inp == 27) { int *modep = &data->g0_mode; size_t inbytesleft0 = inbytesleft; inbytesleft--; inp++; if (inbytesleft == 0) goto incomplete; if (*inp == '$') /* set with multiple bytes */ { inbytesleft--; inp++; } if (inbytesleft == 0) goto incomplete; if (*inp == '(' || *inp == ',') /* G0 */ { inbytesleft--; inp++; } else if (*inp == ')' || *inp == '-') /* G1 */ { inbytesleft--; inp++; modep = &data->g1_mode; } if (inbytesleft == 0) goto incomplete; if (*inp == '!') /* ANSEL is a special case */ { inbytesleft--; inp++; } if (inbytesleft == 0) goto incomplete; *modep = *inp++; /* Final character */ inbytesleft--; (*no_read) += inbytesleft0 - inbytesleft; } if (inbytesleft == 0) return 0; else if (*inp == ' ') { *no_read += 1; return ' '; } else if (*inp < ' ' && data->control_mode) { *no_read += 1; return *inp; } else { unsigned long x; size_t no_read_sub = 0; int mode = *inp < 128 ? data->g0_mode : data->g1_mode; *comb = 0; switch(mode) { case 'B': /* Basic ASCII */ case 's': /* ASCII */ x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case 'E': /* ANSEL */ x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb, 127, 128); break; case 'g': /* Greek */ x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case 'b': /* Subscripts */ x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case 'p': /* Superscripts */ x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case '2': /* Basic Hebrew */ x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case 'N': /* Basic Cyrillic */ x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case 'Q': /* Extended Cyrillic */ x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case '3': /* Basic Arabic */ x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case '4': /* Extended Arabic */ x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case 'S': /* Greek */ x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; case '1': /* Chinese, Japanese, Korean (EACC) */ x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); break; default: *no_read = 0; yaz_iconv_set_errno(cd, YAZ_ICONV_EILSEQ); return 0; } *no_read += no_read_sub; return x; } incomplete: *no_read = 0; yaz_iconv_set_errno(cd, YAZ_ICONV_EINVAL); return 0; } static size_t init_marc8(yaz_iconv_t cd, yaz_iconv_decoder_t d, unsigned char *inp, size_t inbytesleft, size_t *no_read) { struct decoder_data *data = (struct decoder_data *) d->data; data->g0_mode = 'B'; data->g1_mode = 'E'; data->comb_offset = data->comb_size = 0; data->control_mode = 0; return 0; } static size_t init_marc8c(yaz_iconv_t cd, yaz_iconv_decoder_t d, unsigned char *inp, size_t inbytesleft, size_t *no_read) { struct decoder_data *data = (struct decoder_data *) d->data; init_marc8(cd, d, inp, inbytesleft, no_read); data->control_mode = 1; return 0; } void destroy_marc8(yaz_iconv_decoder_t d) { struct decoder_data *data = (struct decoder_data *) d->data; xfree(data); } yaz_iconv_decoder_t yaz_marc8_decoder(const char *fromcode, yaz_iconv_decoder_t d) { if (!yaz_matchstr(fromcode, "MARC8") || !yaz_matchstr(fromcode, "ANSEL")) { d->read_handle = read_marc8; d->init_handle = init_marc8; } else if (!yaz_matchstr(fromcode, "MARC8s")) { d->read_handle = read_marc8s; d->init_handle = init_marc8; } else if (!yaz_matchstr(fromcode, "MARC8c")) { d->read_handle = read_marc8; d->init_handle = init_marc8c; } else return 0; { struct decoder_data *data = (struct decoder_data *) xmalloc(sizeof(*data)); d->data = data; d->destroy_handle = destroy_marc8; } return d; } /* * Local variables: * c-basic-offset: 4 * c-file-style: "Stroustrup" * indent-tabs-mode: nil * End: * vim: shiftwidth=4 tabstop=8 expandtab */