rcx

library of miscellaneous bits of C code
git clone git://git.rr3.xyz/rcx
Log | Files | Refs | README | LICENSE

commit 88a9d951aaf1e00b601fc287f9cdfd1aa90d0c45
parent ebec62e8b16d178054c7dcc44bb5748033b99f26
Author: robert <robertrussell.72001@gmail.com>
Date:   Mon, 11 Jul 2022 20:08:17 -0700

Initial commit

Diffstat:
A.gitignore | 2++
Aexample/opt.c | 52++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/cext/all.h | 8++++++++
Ainclude/cext/alloc.h | 42++++++++++++++++++++++++++++++++++++++++++
Ainclude/cext/cext.h | 14++++++++++++++
Ainclude/cext/def.h | 107+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ainclude/cext/log.h | 25+++++++++++++++++++++++++
Ainclude/cext/opt.h | 31+++++++++++++++++++++++++++++++
Ainclude/cext/utf8.h | 22++++++++++++++++++++++
Ainclude/cext/vec.h | 128+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/alloc.c | 137+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/log.c | 85+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/opt.c | 107+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/str.c | 56++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/utf8.c | 86+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
15 files changed, 902 insertions(+), 0 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+*.o
+*.a
diff --git a/example/opt.c b/example/opt.c
@@ -0,0 +1,52 @@
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "cext/cext.h"
+#include "cext/opt.h"
+
+/* Example driver for the option parser. Recognizes -a ARG, -b, --xxx ARG
+ * and --yyy, then prints the remaining (non-option) arguments. */
+int
+main(int argc, char **argv) {
+	opt_ctx opt;
+
+	opt_init(&opt, &argc, argv);
+	while (opt_parse(&opt)) {
+		if (opt.s == 'a') {
+			/* A missing argument is a user error, not a program bug, so
+			 * report it instead of asserting. */
+			if (!opt.avail) {
+				fprintf(stderr, "option -a requires an argument\n");
+				exit(1);
+			}
+			printf("short a: %s\n", opt_arg(&opt));
+		} else if (opt.s == 'b') {
+			printf("short b\n");
+		} else if (!strcmp(opt.l, "xxx")) {
+			if (!opt.avail) {
+				fprintf(stderr, "option --xxx requires an argument\n");
+				exit(1);
+			}
+			printf("long xxx: %s\n", opt_arg(&opt));
+		} else if (!strcmp(opt.l, "yyy")) {
+			if (opt.attached) {
+				fprintf(stderr, "option --yyy takes no argument\n");
+				exit(1);
+			}
+			printf("long yyy\n");
+		} else {
+			if (opt.s)
+				printf("unknown short opt: %c\n", opt.s);
+			else
+				printf("unknown long opt: %s\n", opt.l);
+			exit(1);
+		}
+	}
+
+	/* opt_parse compacted the non-option arguments into argv[1..argc-1] */
+	printf("\n%d arguments:\n", argc-1);
+	for (int i = 1; i < argc; i++) /* int: argc is signed */
+		printf("\t%s\n", argv[i]);
+}
diff --git a/include/cext/all.h b/include/cext/all.h
@@ -0,0 +1,8 @@
+#pragma once
+
+#include "cext/alloc.h"
+#include "cext/cext.h"
+#include "cext/log.h"
+#include "cext/opt.h"
+#include "cext/utf8.h"
+#include "cext/vec.h"
diff --git a/include/cext/alloc.h b/include/cext/alloc.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include "cext/def.h"
+
+/* A consistently-named set of memory allocators: {,e}{,re}alloc{,n,f}{,z}
+ *     e- => allocation failures are fatal
+ *    re- => realloc-style allocator
+ *     -n => array allocator (with overflow check)
+ *     -f => struct with flexible array member allocator (with overflow check)
+ *     -z => new memory initialized to 0.
+ * All these allocators are interoperable with the stdlib allocators.
+ *
+ * Size overflow in the -n and -f variants counts as an allocation failure:
+ * the non-e variants return a null pointer with errno set to ENOMEM, and the
+ * e- variants are fatal. In the re-...-z variants, the o-prefixed argument
+ * gives the old size or length, which determines where zeroing starts. */
+void *alloc(usize size); /* aka malloc */
+void *allocz(usize size);
+void *allocn(usize len, usize size);
+void *allocnz(usize len, usize size); /* aka calloc */
+void *allocf(usize hsize, usize flen, usize fsize);
+void *allocfz(usize hsize, usize flen, usize fsize);
+void *realloc(void *p, usize size);
+void *reallocz(void *p, usize osize, usize nsize);
+void *reallocn(void *p, usize len, usize size);
+void *reallocnz(void *p, usize olen, usize nlen, usize size);
+void *reallocf(void *p, usize hsize, usize flen, usize fsize);
+void *reallocfz(void *p, usize hsize, usize oflen, usize nflen, usize fsize);
+void *ealloc(usize size);
+void *eallocz(usize size);
+void *eallocn(usize len, usize size);
+void *eallocnz(usize len, usize size);
+void *eallocf(usize hsize, usize flen, usize fsize);
+void *eallocfz(usize hsize, usize flen, usize fsize);
+void *erealloc(void *p, usize size);
+void *ereallocz(void *p, usize osize, usize nsize);
+void *ereallocn(void *p, usize len, usize size);
+void *ereallocnz(void *p, usize olen, usize nlen, usize size);
+void *ereallocf(void *p, usize hsize, usize flen, usize fsize);
+void *ereallocfz(void *p, usize hsize, usize oflen, usize nflen, usize fsize);
+
+void free(void *p);
diff --git a/include/cext/cext.h b/include/cext/cext.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <inttypes.h>
+#include <stddef.h>
+
+/* Standard headers that should be part of the language proper */
+#include <stdarg.h>
+#include <stdbool.h>
+#if __STDC_VERSION__ >= 201112L
+#include <stdalign.h>
+#include <stdnoreturn.h>
+#endif
+
+#include "cext/def.h"
diff --git a/include/cext/def.h b/include/cext/def.h
@@ -0,0 +1,107 @@
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* Token pasting; the extra level lets macro arguments expand first */
+#define JOIN_AUX(a,b) a##b
+#define JOIN(a,b) JOIN_AUX(a,b)
+
+/* LEN is only meaningful for true arrays (not pointers). MIN and MAX evaluate
+ * one of their arguments twice, so avoid arguments with side effects. */
+#define LEN(a) (sizeof (a) / sizeof (a)[0])
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+typedef unsigned long ulong;
+typedef long long llong;
+typedef unsigned long long ullong;
+
+#define I8_MIN INT8_MIN
+#define I8_MAX INT8_MAX
+#define I8_C INT8_C
+typedef int8_t i8;
+
+#define I16_MIN INT16_MIN
+#define I16_MAX INT16_MAX
+#define I16_C INT16_C
+typedef int16_t i16;
+
+#define I32_MIN INT32_MIN
+#define I32_MAX INT32_MAX
+#define I32_C INT32_C
+typedef int32_t i32;
+
+#define I64_MIN INT64_MIN
+#define I64_MAX INT64_MAX
+#define I64_C INT64_C
+typedef int64_t i64;
+
+#define IMAX_MIN INTMAX_MIN
+#define IMAX_MAX INTMAX_MAX
+#define IMAX_C INTMAX_C
+typedef intmax_t imax;
+
+#define IPTR_MIN INTPTR_MIN
+#define IPTR_MAX INTPTR_MAX
+typedef intptr_t iptr;
+
+/* typedef ssize_t isize; */
+
+#define U8_MAX UINT8_MAX
+#define U8_C UINT8_C
+typedef uint8_t u8;
+
+#define U16_MAX UINT16_MAX
+#define U16_C UINT16_C
+typedef uint16_t u16;
+
+#define U32_MAX UINT32_MAX
+#define U32_C UINT32_C
+typedef uint32_t u32;
+
+#define U64_MAX UINT64_MAX
+#define U64_C UINT64_C
+typedef uint64_t u64;
+
+#define UMAX_MAX UINTMAX_MAX
+#define UMAX_C UINTMAX_C
+typedef uintmax_t umax;
+
+#define UPTR_MAX UINTPTR_MAX
+typedef uintptr_t uptr;
+
+#define USIZE_MAX SIZE_MAX
+typedef size_t usize;
+
+#ifdef __SIZEOF_INT128__
+#define CEXT_HAVE_128 1
+
+#define I128_MIN ((i128)-1 - I128_MAX)
+#define I128_MAX ((i128)(U128_MAX >> 1))
+typedef __int128 i128;
+
+#define U128_MAX (((u128)U64_MAX << 64) | U64_MAX)
+typedef unsigned __int128 u128;
+
+#endif
+
+#define RUNE_BAD RUNE_C(0xFFFD)
+#define RUNE_MAX RUNE_C(0x10FFFF)
+#define RUNE_C U32_C
+typedef u32 rune;
+
+#if __STDC_VERSION__ >= 201112L
+typedef max_align_t maxalign;
+#else
+/* Fallback which is probably correct */
+typedef struct {
+	intmax_t i; /* biggest integer */
+	long double d; /* biggest floating point */
+	void *p; /* data pointer */
+	void (*f)(void); /* function pointer */
+} maxalign;
+#endif
diff --git a/include/cext/log.h b/include/cext/log.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <stdbool.h>
+
+/* Let GCC-compatible compilers check format strings against arguments */
+#ifdef __GNUC__
+#define CEXT_PRINTF(fmt, args) __attribute__((format(printf, fmt, args)))
+#else
+#define CEXT_PRINTF(fmt, args)
+#endif
+
+/* Log a printf-style message at the given severity. fatalf also exits the
+ * process with status 1, even if the message itself is filtered out. */
+#define infof(...) cext_log(__FILE__, __LINE__, 0, "INFO", "\x1b[32m", 0, __VA_ARGS__)
+#define warnf(...) cext_log(__FILE__, __LINE__, 1, "WARN", "\x1b[33m", 0, __VA_ARGS__)
+#define errorf(...) cext_log(__FILE__, __LINE__, 2, "ERROR", "\x1b[31m", 0, __VA_ARGS__)
+#define fatalf(...) cext_log(__FILE__, __LINE__, 3, "FATAL", "\x1b[31m", 1, __VA_ARGS__)
+
+/* Configure logging. color: > 0 forces color on, < 0 forces it off, 0
+ * auto-detects. Messages with level below min_level are not printed. */
+void cext_log_init(int color, bool log_time, bool log_loc, int min_level);
+
+/* Backend for the macros above; exits with status code if code is nonzero */
+void cext_log(const char *file, int line, int level, const char *name,
+	const char *color, int code, const char *fmt, ...) CEXT_PRINTF(7, 8);
diff --git a/include/cext/opt.h b/include/cext/opt.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <stdbool.h>
+
+/* Option parser state. The first group of fields describes the option most
+ * recently returned by opt_parse; the rest is bookkeeping. */
+typedef struct opt_ctx {
+	char s, *l; /* Short/long opt; l is "" if s != 0 */
+	bool avail; /* Argument for option is available */
+	bool attached; /* Argument for option of form "-oARG" or "--opt=ARG" */
+
+	/* Internal */
+	char *arg; /* Candidate argument for the current option, or null */
+	bool arg_used; /* opt_arg was called for the current option */
+	char *cluster; /* Position within a "-abc" cluster, or null */
+	int *argc; /* Caller's argc, decremented as options are consumed */
+	char **o; /* Next element of argv to examine */
+	char **a; /* Where the next non-option argument is stored */
+} opt_ctx;
+
+/* Start parsing argv (which must be null-terminated, like main's). */
+void opt_init(opt_ctx *opt, int *argc, char **argv);
+
+/* Advance to the next option and return true, or return false when all
+ * options have been consumed. On false, the non-option arguments have been
+ * moved to argv[1..*argc-1], followed by a null terminator. */
+bool opt_parse(opt_ctx *opt);
+
+/* Return the current option's argument and mark it consumed. Only valid when
+ * opt->avail is true; otherwise this asserts and, failing that, returns null. */
+char *opt_arg(opt_ctx *opt);
diff --git a/include/cext/utf8.h b/include/cext/utf8.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "cext/def.h"
+
+#define UTF8_SIZE 4
+
+/* Return the number of bytes needed to encode c, or 0 if c is an invalid
+ * codepoint. If s is nonnull, then it must have length >= utf8encode(0, c),
+ * which is guaranteed to be at most UTF8_SIZE; in this case, if c is a valid
+ * codepoint, then encode c into s. */
+usize utf8encode(char *s, rune c);
+
+/* Decode the first rune in s and return the number of consumed bytes. If this
+ * succeeds and c is nonnull, then set *c to the decoded rune. Otherwise, no
+ * valid rune is legally encoded as a prefix of s; in this case, set *c to
+ * RUNE_BAD if c is nonnull, and return n such that
+ *  - n = 0 if s is null or empty, or if s ends in the middle of a sequence
+ *    that so far consists of a leading byte and continuation bytes;
+ *  - n > 0 otherwise, meaning the first n bytes of s cannot start a valid
+ *    rune (but if n < slen, then s[n] might be the first byte of a valid
+ *    rune). */
+usize utf8decode(rune *c, const char *s, usize slen);
diff --git a/include/cext/vec.h b/include/cext/vec.h
@@ -0,0 +1,128 @@
+#pragma once
+
+#include <string.h>
+
+#include "cext/alloc.h"
+#include "cext/def.h"
+
+/* Header stored immediately before the elements of every vector. The flexible
+ * maxalign member gives the header (and so the elements) maximal alignment. */
+typedef struct {
+	usize len;
+	usize cap;
+	maxalign arr[];
+} vechdr;
+
+#define VECHDR(v) ((vechdr *)(v) - 1)
+
+/* Defaults; define any of these before including this header to override */
+#ifndef VEC_STATIC
+#define VEC_STATIC
+#endif
+#ifndef METHOD
+#define METHOD(name, prefix) JOIN(JOIN(prefix,_),name)
+#endif
+#ifndef VEC_MIN_CAP
+#define VEC_MIN_CAP 8
+#endif
+#ifndef VEC_ALLOC
+#define VEC_ALLOC ereallocf
+#endif
+#ifndef VEC_FREE
+#define VEC_FREE free
+#endif
+
+/* Declare the vector API for element type T; the arguments after T are
+ * handed to METHOD to build each function name. A vector is a T * pointing
+ * at its elements, and a null T * is an empty vector. Functions returning int
+ * return 0 on success and -1 on failure. */
+#define VEC_DECLARE(T, ...)\
+VEC_STATIC void METHOD(free,##__VA_ARGS__)(T **v); \
+VEC_STATIC usize METHOD(len,##__VA_ARGS__)(T **v); \
+VEC_STATIC usize METHOD(cap,##__VA_ARGS__)(T **v); \
+VEC_STATIC int METHOD(resize,##__VA_ARGS__)(T **v, usize cap); \
+VEC_STATIC int METHOD(reserve,##__VA_ARGS__)(T **v, usize n); \
+VEC_STATIC int METHOD(ins,##__VA_ARGS__)(T **v, usize i, T e); \
+VEC_STATIC int METHOD(push,##__VA_ARGS__)(T **v, T e); \
+VEC_STATIC T METHOD(del,##__VA_ARGS__)(T **v, usize i); \
+VEC_STATIC T METHOD(pop,##__VA_ARGS__)(T **v);
+
+/* Define the functions declared by VEC_DECLARE. ins requires i <= len; del
+ * requires i < len; pop requires a nonempty vector. */
+#define VEC_DEFINE(T, ...)\
+VEC_STATIC void METHOD(free,##__VA_ARGS__)(T **v) { \
+	if (*v) \
+		VEC_FREE(VECHDR(*v)); \
+	*v = 0; \
+} \
+VEC_STATIC usize METHOD(len,##__VA_ARGS__)(T **v) { \
+	return *v ? VECHDR(*v)->len : 0; \
+} \
+VEC_STATIC usize METHOD(cap,##__VA_ARGS__)(T **v) { \
+	return *v ? VECHDR(*v)->cap : 0; \
+} \
+VEC_STATIC int METHOD(resize,##__VA_ARGS__)(T **v, usize cap) { \
+	if (cap == 0) { \
+		METHOD(free,##__VA_ARGS__)(v); \
+	} else { \
+		cap = MAX(cap, VEC_MIN_CAP); \
+		vechdr *h = *v ? VECHDR(*v) : 0; \
+		/* A fresh allocation has no length to preserve */ \
+		usize olen = h ? h->len : 0; \
+		h = VEC_ALLOC(h, sizeof *h, cap, sizeof (*v)[0]); \
+		if (!h) \
+			return -1; \
+		h->len = MIN(olen, cap); \
+		h->cap = cap; \
+		*v = (void *)(h + 1); \
+	} \
+	return 0; \
+} \
+VEC_STATIC int METHOD(reserve,##__VA_ARGS__)(T **v, usize n) { \
+	vechdr *h = *v ? VECHDR(*v) : 0; \
+	usize rem = h ? h->cap - h->len : 0; \
+	if (n > rem) { \
+		usize need = n - rem; \
+		usize cap = h ? h->cap : 0; \
+		if (need > USIZE_MAX - cap) \
+			return -1; /* Requested capacity is not representable */ \
+		/* Double when possible, but always grow by at least need */ \
+		usize grow = MAX(cap, need); \
+		if (grow > USIZE_MAX - cap) \
+			grow = need; \
+		return METHOD(resize,##__VA_ARGS__)(v, cap + grow); \
+	} else { \
+		return 0; \
+	} \
+} \
+VEC_STATIC int METHOD(ins,##__VA_ARGS__)(T **v, usize i, T e) { \
+	if (METHOD(reserve,##__VA_ARGS__)(v, 1)) \
+		return -1; \
+	memmove(&(*v)[i+1], &(*v)[i], (VECHDR(*v)->len - i) * sizeof (*v)[0]); \
+	(*v)[i] = e; \
+	VECHDR(*v)->len++; \
+	return 0; \
+} \
+VEC_STATIC int METHOD(push,##__VA_ARGS__)(T **v, T e) { \
+	return METHOD(ins,##__VA_ARGS__)(v, METHOD(len,##__VA_ARGS__)(v), e); \
+} \
+VEC_STATIC T METHOD(del,##__VA_ARGS__)(T **v, usize i) { \
+	T e = (*v)[i]; \
+	memmove(&(*v)[i], &(*v)[i+1], (VECHDR(*v)->len - i - 1) * sizeof (*v)[0]); \
+	VECHDR(*v)->len--; \
+	return e; \
+} \
+VEC_STATIC T METHOD(pop,##__VA_ARGS__)(T **v) { \
+	return METHOD(del,##__VA_ARGS__)(v, VECHDR(*v)->len - 1); \
+}
+
+/* TODO?
+insn/insnz
+deln
+clr => set length to 0 without resizing
+dup => duplicate/clone vector
+optionally take cmp function and define:
+	sort => qsort wrapper
+	bsearch => bsearch wrapper
+	lsearch => linear search on unsorted array
+*/
diff --git a/src/alloc.c b/src/alloc.c
@@ -0,0 +1,137 @@
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "cext/cext.h"
+#include "cext/alloc.h"
+#include "cext/log.h"
+
+/* If s1 < MUL_NO_OVERFLOW and s2 < MUL_NO_OVERFLOW, then s1*s2 <= USIZE_MAX
+ * (but not conversely). This lets us avoid division in overflow checks. */
+#define MUL_NO_OVERFLOW ((usize) 1 << (sizeof(usize) * 4))
+
+/* Report whether a*b exceeds USIZE_MAX */
+static bool
+mul_will_overflow(usize a, usize b) {
+	return (a >= MUL_NO_OVERFLOW || b >= MUL_NO_OVERFLOW)
+		&& a > 0 && USIZE_MAX/a < b;
+}
+
+/* Store a + b*c in *r and return true, or return false if the result does not
+ * fit in a usize. Overflow is reported to the caller rather than treated as
+ * fatal so that only the e- allocators terminate the program. */
+static bool
+addmul(usize *r, usize a, usize b, usize c) {
+	if (mul_will_overflow(b, c))
+		return false;
+	usize bc = b * c;
+	if (a > USIZE_MAX - bc)
+		return false;
+	*r = a + bc;
+	return true;
+}
+
+void *
+alloc(usize size) {
+	return malloc(size);
+}
+
+void *
+allocz(usize size) {
+	return calloc(1, size);
+}
+
+void *
+allocn(usize len, usize size) {
+	return allocf(0, len, size);
+}
+
+void *
+allocnz(usize len, usize size) {
+	return calloc(len, size);
+}
+
+void *
+allocf(usize hsize, usize flen, usize fsize) {
+	usize size;
+	if (!addmul(&size, hsize, flen, fsize)) {
+		errno = ENOMEM; /* Report like an out-of-memory failure */
+		return 0;
+	}
+	return alloc(size);
+}
+
+void *
+allocfz(usize hsize, usize flen, usize fsize) {
+	usize size;
+	if (!addmul(&size, hsize, flen, fsize)) {
+		errno = ENOMEM;
+		return 0;
+	}
+	return allocz(size);
+}
+
+void *
+reallocz(void *p, usize osize, usize nsize) {
+	p = realloc(p, nsize);
+	if (p && nsize > osize)
+		memset((char *) p + osize, 0, nsize - osize); /* Zero the new tail */
+	return p;
+}
+
+void *
+reallocn(void *p, usize len, usize size) {
+	return reallocf(p, 0, len, size);
+}
+
+void *
+reallocnz(void *p, usize olen, usize nlen, usize size) {
+	return reallocfz(p, 0, olen, nlen, size);
+}
+
+void *
+reallocf(void *p, usize hsize, usize flen, usize fsize) {
+	usize size;
+	if (!addmul(&size, hsize, flen, fsize)) {
+		errno = ENOMEM;
+		return 0;
+	}
+	return realloc(p, size);
+}
+
+void *
+reallocfz(void *p, usize hsize, usize oflen, usize nflen, usize fsize) {
+	usize nsize;
+	if (!addmul(&nsize, hsize, nflen, fsize)) {
+		errno = ENOMEM;
+		return 0;
+	}
+	/* The old size is not re-checked: it describes an existing allocation */
+	return reallocz(p, hsize + oflen*fsize, nsize);
+}
+
+/* EALLOC(name, params...)(args...) defines e<name>, which forwards args to
+ * <name> and is fatal if the result is null. The parameter list and the
+ * argument list are separate macro invocations: EALLOC leaves the call
+ * unfinished and the trailing (args...) completes it via EALLOC_AUX. */
+#define EALLOC(name, ...)\
+	void *e##name(__VA_ARGS__) {\
+		void *q = name(EALLOC_AUX
+#define EALLOC_AUX(...)\
+		__VA_ARGS__);\
+		if (!q) fatalf("allocation failure");\
+		return q;\
+	}
+
+EALLOC(alloc, usize size)(size)
+EALLOC(allocz, usize size)(size)
+EALLOC(allocn, usize len, usize size)(len, size)
+EALLOC(allocnz, usize len, usize size)(len, size)
+EALLOC(allocf, usize hsize, usize flen, usize fsize)(hsize, flen, fsize)
+EALLOC(allocfz, usize hsize, usize flen, usize fsize)(hsize, flen, fsize)
+EALLOC(realloc, void *p, usize size)(p, size)
+EALLOC(reallocz, void *p, usize osize, usize nsize)(p, osize, nsize)
+EALLOC(reallocn, void *p, usize len, usize size)(p, len, size)
+EALLOC(reallocnz, void *p, usize olen, usize nlen, usize size)(p, olen, nlen, size)
+EALLOC(reallocf, void *p, usize hsize, usize flen, usize fsize)(p, hsize, flen, fsize)
+EALLOC(reallocfz, void *p, usize hsize, usize oflen, usize nflen, usize fsize)(p, hsize, oflen, nflen, fsize)
diff --git a/src/log.c b/src/log.c
@@ -0,0 +1,85 @@
+#define _POSIX_C_SOURCE 199506L
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "cext/cext.h"
+#include "cext/log.h"
+
+#define ISO8601_SIZE (sizeof "YYYY-MM-DDThh:mm:ssZ")
+#define SGR(c) (use_color ? (c) : "")
+#define RESET "\x1b[m"
+#define BOLD "\x1b[1m"
+#define FAINT "\x1b[2m"
+
+static bool use_color = false;
+static bool log_time = false;
+static bool log_loc = false;
+static int min_level = 0;
+
+/* Set the global logging configuration; see cext/log.h */
+void
+cext_log_init(int color, bool log_time_, bool log_loc_, int min_level_) {
+	if (color > 0) { /* force on */
+		use_color = true;
+	} else if (color < 0) { /* force off */
+		use_color = false;
+	} else { /* detect */
+		char *no_color = getenv("NO_COLOR"); /* https://no-color.org */
+		use_color = isatty(fileno(stderr))
+			&& !(no_color && no_color[0] != '\0');
+	}
+	log_time = log_time_;
+	log_loc = log_loc_;
+	min_level = min_level_;
+}
+
+/* Format t as a UTC ISO 8601 timestamp into buf, which must hold at least
+ * ISO8601_SIZE bytes. On failure, buf is set to the empty string. */
+static char *
+iso8601(char *buf, time_t t) {
+	struct tm tm;
+	if (!gmtime_r(&t, &tm)
+	|| !strftime(buf, ISO8601_SIZE, "%Y-%m-%dT%H:%M:%SZ", &tm))
+		buf[0] = '\0'; /* strftime buffer contents are undefined on failure */
+	return buf;
+}
+
+void
+cext_log(
+	const char *file, int line,
+	int level, const char *name, const char *color,
+	int code,
+	const char *fmt, ...
+) {
+	if (level >= min_level) {
+		/* Read log_time once so both checks below always agree */
+		bool stamped = log_time;
+		char stamp[ISO8601_SIZE];
+		if (stamped)
+			iso8601(stamp, time(0)); /* Do this before locking stderr */
+
+		/* Lock stderr so other threads' IO does not get interleaved with ours. */
+		flockfile(stderr);
+
+		if (stamped)
+			fprintf(stderr, "%s%s%s ", SGR(FAINT), stamp, SGR(RESET));
+		fprintf(stderr, "%s%s%-5s%s ", SGR(BOLD), SGR(color), name, SGR(RESET));
+		if (log_loc && file)
+			fprintf(stderr, "%s%s:%d%s ", SGR(FAINT), file, line, SGR(RESET));
+
+		va_list args;
+		va_start(args, fmt);
+		vfprintf(stderr, fmt, args);
+		va_end(args);
+		fputc('\n', stderr);
+
+		funlockfile(stderr);
+	}
+
+	/* Exit even when the message was filtered: callers of fatalf rely on it
+	 * not returning. */
+	if (code)
+		exit(code);
+}
diff --git a/src/opt.c b/src/opt.c
@@ -0,0 +1,107 @@
+#include <assert.h>
+#include <string.h>
+
+#include "cext/cext.h"
+#include "cext/opt.h"
+
+/* Prepare opt to parse argv, which must be null-terminated like main's.
+ * *argc is decremented as options are consumed, so argc and argv must stay
+ * valid for as long as opt is in use. */
+void
+opt_init(opt_ctx *opt, int *argc, char **argv) {
+	/* Initialize every field so no indeterminate value can ever be read */
+	opt->s = 0;
+	opt->l = "";
+	opt->avail = false;
+	opt->attached = false;
+	opt->arg = 0;
+	opt->arg_used = false;
+	opt->cluster = 0;
+	opt->argc = argc;
+	/* Skip the program name, unless argv is empty */
+	opt->o = opt->a = argv[0] ? &argv[1] : &argv[0];
+}
+
+/* Find the next option. Return true after filling in the public fields of
+ * opt, or false once every element of argv has been examined; by then the
+ * non-option arguments have been moved to the front of argv (after the
+ * program name) and null-terminated. */
+bool
+opt_parse(opt_ctx *opt) {
+	/* Finish consuming the previous option's argument, if it was taken */
+	if (opt->arg_used) {
+		if (opt->attached) {
+			opt->cluster = 0; /* Any rest of the cluster was the argument */
+		} else {
+			opt->o++; /* Argument was the next element of argv */
+			(*opt->argc)--;
+		}
+		opt->arg_used = false;
+	}
+
+	/* Continue with the next short option in the current cluster */
+	if (opt->cluster) {
+		if ((opt->s = *++opt->cluster))
+			goto found_opt;
+		opt->cluster = 0;
+	}
+
+	bool skip = false;
+	for (; *opt->o; opt->o++) {
+		if (skip || (*opt->o)[0] != '-' || (*opt->o)[1] == '\0') {
+			/* Got an argument */
+			*opt->a++ = *opt->o;
+			continue;
+		}
+		(*opt->argc)--;
+		if ((*opt->o)[1] == '-' && (*opt->o)[2] == '\0') {
+			/* Got "--", so everything else is an argument */
+			skip = true;
+			continue;
+		}
+		/* Got an option */
+		if ((*opt->o)[1] == '-') { /* Long */
+			opt->s = 0;
+			opt->l = &(*opt->o)[2];
+		} else { /* Short */
+			opt->cluster = &(*opt->o)[1];
+			opt->s = *opt->cluster;
+			opt->l = "";
+		}
+		opt->o++;
+		goto found_opt;
+	}
+
+	/* End of option parsing */
+	*opt->a = 0;
+	return false;
+
+found_opt:
+	/* Find argument for option */
+	if ((opt->arg = strchr(opt->l, '='))) { /* "--opt=ARG" */
+		*opt->arg++ = '\0'; /* Null-terminate opt->l */
+		opt->attached = true;
+	} else if (opt->s && opt->cluster[1] != '\0') { /* "-oARG" */
+		opt->arg = &opt->cluster[1];
+		opt->attached = true;
+	} else { /* "-o ARG" or "--opt ARG" or nothing */
+		opt->arg = *opt->o;
+		opt->attached = false;
+	}
+	opt->avail = !!opt->arg;
+
+	return true;
+}
+
+/* Return the argument of the option found by the last opt_parse call and
+ * mark it consumed so the next opt_parse skips over it. */
+char *
+opt_arg(opt_ctx *opt) {
+	assert(opt->avail);
+	/* With NDEBUG the assert vanishes; never consume a missing argument,
+	 * or the next opt_parse would step past the end of argv. */
+	if (!opt->avail)
+		return 0;
+	opt->arg_used = true;
+	return opt->arg;
+}
diff --git a/src/str.c b/src/str.c
@@ -0,0 +1,56 @@
+/* TODO: declare these functions in a public header */
+
+#define _POSIX_C_SOURCE 200809L /* strdup */
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "cext/cext.h"
+#include "cext/alloc.h"
+#include "cext/log.h"
+
+/* Format into a newly allocated string stored in *s, which the caller must
+ * free. Return the length of the string, or a negative value on failure, in
+ * which case *s is left untouched. */
+int
+vaprintf(char **s, const char *fmt, va_list args) {
+	/* First pass on a copy of args, just to measure the output */
+	va_list args2;
+	va_copy(args2, args);
+	int len = vsnprintf(0, 0, fmt, args2);
+	va_end(args2);
+	if (len < 0)
+		return len;
+
+	usize size = (usize)len + 1; /* Do the +1 unsigned: len may be INT_MAX */
+	char *buf = alloc(size);
+	if (!buf)
+		return -1;
+
+	int ret = vsnprintf(buf, size, fmt, args);
+	if (ret < 0)
+		free(buf);
+	else
+		*s = buf;
+
+	return ret;
+}
+
+/* Variadic front end to vaprintf */
+int
+aprintf(char **s, const char *fmt, ...) {
+	va_list args;
+	va_start(args, fmt);
+	int ret = vaprintf(s, fmt, args);
+	va_end(args);
+	return ret;
+}
+
+/* Duplicate s like strdup, except that allocation failure is fatal */
+char *
+estrdup(const char *s) {
+	char *dup = strdup(s);
+	if (!dup) fatalf("allocation failure");
+	return dup;
+}
diff --git a/src/utf8.c b/src/utf8.c
@@ -0,0 +1,86 @@
+#include "cext/cext.h"
+#include "cext/utf8.h"
+
+#define SURROGATE_MIN 0xD800
+#define SURROGATE_MAX 0xDFFF
+
+/* Indexed by sequence length - 1: leading byte marker, the mask that selects
+ * the marker bits, and the smallest/largest rune of that length. */
+static const uchar utf8byte[] = { 0x0,  0xC0,   0xE0,     0xF0};
+static const uchar utf8mask[] = {0x80,  0xE0,   0xF0,     0xF8};
+static const rune  utf8min[]  = { 0x0,  0x80,  0x800,  0x10000};
+static const rune  utf8max[]  = {0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
+
+/* Could c have been encoded in fewer than len bytes? */
+static bool
+utf8overlong(rune c, usize len) {
+	return c < utf8min[len-1];
+}
+
+/* Is c in range and not a surrogate? */
+static bool
+utf8encodable(rune c) {
+	return c <= RUNE_MAX && (c < SURROGATE_MIN || c > SURROGATE_MAX);
+}
+
+/* Return the encoded length of c, or 0 if c is not encodable */
+static usize
+utf8len(rune c) {
+	if (!utf8encodable(c))
+		return 0;
+
+	usize len = 1;
+	while (c > utf8max[len-1])
+		len++;
+	return len;
+}
+
+usize
+utf8encode(char *s, rune c) {
+	usize len = utf8len(c);
+	if (!s || len == 0)
+		return len;
+
+	for (usize i = len-1; i > 0; i--) { /* Continuation bytes */
+		((uchar *)s)[i] = 0x80 | (c & 0x3F);
+		c >>= 6;
+	}
+	((uchar *)s)[0] = utf8byte[len-1] | (uchar)c; /* Leading byte */
+
+	return len;
+}
+
+usize
+utf8decode(rune *c, const char *s, usize slen) {
+	if (c)
+		*c = RUNE_BAD;
+
+	if (!s || slen == 0) /* No input? */
+		return 0;
+
+	/* Determine encoded sequence length based on first byte */
+	usize len = 1;
+	for (; len <= UTF8_SIZE; len++) {
+		if (((uchar)s[0] & utf8mask[len-1]) == utf8byte[len-1])
+			break;
+	}
+	if (len > UTF8_SIZE) /* Invalid leading byte? */
+		return 1;
+
+	/* Decode codepoint. Check the continuation bytes that are present before
+	 * reporting truncation, so that a malformed sequence is never mistaken
+	 * for an incomplete one. */
+	usize avail = MIN(len, slen);
+	rune r = (uchar)s[0] & ~utf8mask[len-1];
+	for (usize i = 1; i < avail; i++) {
+		if (((uchar)s[i] & 0xC0) != 0x80) /* Invalid continuation byte? */
+			return i;
+		r = (r << 6) | ((uchar)s[i] & 0x3F);
+	}
+	if (len > slen) /* Not enough input? */
+		return 0;
+
+	if (c && utf8encodable(r) && !utf8overlong(r, len))
+		*c = r;
+	return len;
+}