/*
 * Copyright (C) 2020 Siara Logics (cc)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * @author Arundale Ramanathan
 *
 */
/**
 * @file unishox2.h
 * @author Arundale Ramanathan, James Z. M. Gao
 * @brief API for Unishox2 Compression and Decompression
 *
 * This file describes each function of the Unishox2 API \n
 * For finding out how this API can be used in your program, \n
 * please see test_unishox2.c.
 */

// Include guard. The matching #endif is not part of this section; it is
// expected at the end of the file.
#ifndef unishox2
#define unishox2

#define UNISHOX_VERSION "2.0" ///< Unicode spec version

/**
 * Macro switch to enable/disable the output buffer length parameter in the low level API \n
 * Disabled by default (the fallback value defined here is 0) \n
 * When this switch is enabled, all the API functions \n
 * except the simple API functions accept an additional parameter olen \n
 * that enables the developer to pass the size of the output buffer provided \n
 * so that the API function does not write beyond that length. \n
 * This can be disabled if the developer knows that the buffer provided is large enough, \n
 * so no additional parameter is passed and the program is faster since the additional check \n
 * for output length is not performed at each step \n
 * The simple API, i.e. unishox2_(de)compress_simple, will always omit the buffer length
 */
#ifndef UNISHOX_API_WITH_OUTPUT_LEN
#define UNISHOX_API_WITH_OUTPUT_LEN 0
#endif

/// Up to 8 bits of initial magic bit sequence can be included.
/// Bit count can be specified with UNISHOX_MAGIC_BIT_LEN
#ifndef UNISHOX_MAGIC_BITS
#define UNISHOX_MAGIC_BITS 0xFF
#endif

/// Desired length of Magic bits defined by UNISHOX_MAGIC_BITS.
/// A value supplied by the build is checked at compile time: 0 through 8
/// (inclusive) is accepted, anything else stops compilation.
/// When nothing is supplied the length defaults to 1.
#ifdef UNISHOX_MAGIC_BIT_LEN
#if UNISHOX_MAGIC_BIT_LEN < 0 || UNISHOX_MAGIC_BIT_LEN > 8
#error "UNISHOX_MAGIC_BIT_LEN must be in the range [0, 8]"
#endif
#else
#define UNISHOX_MAGIC_BIT_LEN 1
#endif

// enum {USX_ALPHA = 0, USX_SYM, USX_NUM, USX_DICT, USX_DELTA};
// NOTE(review): the commented-out enum above presumably names the five
// positions of each preset array in this section. The values agree with that
// order, e.g. USX_HCODE_LENS_ALPHA_NUM_ONLY is non-zero only at indices 0 and 2,
// and USX_HCODE_LENS_FAVOR_ALPHA gives index 0 the only 1-bit code.
// NOTE(review): every code value is consistent with a bit pattern stored in the
// high-order bits of the byte, with the matching LENS entry giving its length
// in bits (0xC0 with length 3 reads as 110). A length of 0 presumably marks a
// position that is not used by the preset -- confirm against the implementation.

/// Default Horizontal codes. When composition of text is known beforehand,
/// the other hcodes in this section can be used to achieve more compression.
#define USX_HCODES_DFLT (const unsigned char[]){0x00, 0x40, 0x80, 0xC0, 0xE0}
/// Length of each default hcode
#define USX_HCODE_LENS_DFLT (const unsigned char[]){2, 2, 2, 3, 3}

/// Horizontal codes preset for English Alphabet content only
#define USX_HCODES_ALPHA_ONLY (const unsigned char[]){0x00, 0x00, 0x00, 0x00, 0x00}
/// Length of each Alpha only hcode
#define USX_HCODE_LENS_ALPHA_ONLY (const unsigned char[]){0, 0, 0, 0, 0}

/// Horizontal codes preset for Alpha Numeric content only
#define USX_HCODES_ALPHA_NUM_ONLY (const unsigned char[]){0x00, 0x00, 0x80, 0x00, 0x00}
/// Length of each Alpha numeric hcode
#define USX_HCODE_LENS_ALPHA_NUM_ONLY (const unsigned char[]){1, 0, 1, 0, 0}

/// Horizontal codes preset for Alpha Numeric and Symbol content only
#define USX_HCODES_ALPHA_NUM_SYM_ONLY (const unsigned char[]){0x00, 0x80, 0xC0, 0x00, 0x00}
/// Length of each Alpha numeric and symbol hcodes
#define USX_HCODE_LENS_ALPHA_NUM_SYM_ONLY (const unsigned char[]){1, 2, 2, 0, 0}

/// Horizontal codes preset favouring Alphabet content
#define USX_HCODES_FAVOR_ALPHA (const unsigned char[]){0x00, 0x80, 0xA0, 0xC0, 0xE0}
/// Length of each hcode favouring Alpha content
#define USX_HCODE_LENS_FAVOR_ALPHA (const unsigned char[]){1, 3, 3, 3, 3}

/// Horizontal codes preset favouring
/// repeating sequences
#define USX_HCODES_FAVOR_DICT (const unsigned char[]){0x00, 0x40, 0xC0, 0x80, 0xE0}
/// Length of each hcode favouring repeating sequences
#define USX_HCODE_LENS_FAVOR_DICT (const unsigned char[]){2, 2, 3, 2, 3}

/// Horizontal codes preset favouring symbols
#define USX_HCODES_FAVOR_SYM (const unsigned char[]){0x80, 0x00, 0xA0, 0xC0, 0xE0}
/// Length of each hcode favouring symbols
#define USX_HCODE_LENS_FAVOR_SYM (const unsigned char[]){3, 1, 3, 3, 3}

//#define USX_HCODES_FAVOR_UMLAUT {0x00, 0x40, 0xE0, 0xC0, 0x80}
//#define USX_HCODE_LENS_FAVOR_UMLAUT {2, 2, 3, 3, 2}
/// Horizontal codes preset favouring umlaut letters
#define USX_HCODES_FAVOR_UMLAUT (const unsigned char[]){0x80, 0xA0, 0xC0, 0xE0, 0x00}
/// Length of each hcode favouring umlaut letters
#define USX_HCODE_LENS_FAVOR_UMLAUT (const unsigned char[]){3, 3, 3, 3, 1}

/// Horizontal codes preset for no repeating sequences
#define USX_HCODES_NO_DICT (const unsigned char[]){0x00, 0x40, 0x80, 0x00, 0xC0}
/// Length of each hcode for no repeating sequences
#define USX_HCODE_LENS_NO_DICT (const unsigned char[]){2, 2, 2, 0, 2}

/// Horizontal codes preset for no Unicode characters
#define USX_HCODES_NO_UNI (const unsigned char[]){0x00, 0x40, 0x80, 0xC0, 0x00}
/// Length of each hcode for no Unicode characters
#define USX_HCODE_LENS_NO_UNI (const unsigned char[]){2, 2, 2, 2, 0}

/// Default frequently occurring sequences. When composition of text is known beforehand,
/// the other sequences in this section can be used to achieve more compression.
/// NOTE(review): the third entry of USX_FREQ_SEQ_DFLT is an empty string in the
/// text as found and is reproduced unchanged. An empty "frequent sequence" looks
/// unintended (possibly markup that was stripped in transit) -- confirm against
/// the canonical header before relying on it.
#define USX_FREQ_SEQ_DFLT (const char *[]){"\": \"", "\": ", "", "=\"", "\":\"", "://"}

/// Frequently occurring sequences in text content
#define USX_FREQ_SEQ_TXT (const char *[]){" the ", " and ", "tion", " with", "ing", "ment"}

/// Frequently occurring sequences in URL content
#define USX_FREQ_SEQ_URL (const char *[]){"https://", "www.", ".com", "http://", ".org", ".net"}

/// Frequently occurring sequences in JSON content
#define USX_FREQ_SEQ_JSON (const char *[]){"\": \"", "\": ", "\",", "}}}", "\":\"", "}}"}

/// NOTE(review): the HTML preset below continues onto the following line, where its
/// last string literal is broken by a raw line break. It is left exactly as found.
/// Frequently occurring sequences in HTML content #define USX_FREQ_SEQ_HTML \ (const char *[]) \ { \ "", "=\"", "div", "href", "class", "
" \ } /// Frequently occurring sequences in XML content #define USX_FREQ_SEQ_XML \ (const char *[]) \ { \ "", "=\"", "\">", "