rcx

library of miscellaneous bits of C code
git clone git://git.rr3.xyz/rcx
Log | Files | Refs | README | LICENSE

commit dfce8a98479c80a98ce8c1017394c7f2d93e4814
parent c88e97c7aac0df9bdb9664c7bb9c1a72cfab277e
Author: robert <robertrussell.72001@gmail.com>
Date:   Wed, 10 Aug 2022 18:56:19 -0700

Add function to get unicode category of given rune

Much better than using ICU or some other awful library.

Diffstat:
M .gitignore | 2 ++
M Makefile | 19 ++++++++++++++++++-
M config.mk | 2 ++
M inc/cext/all.h | 1 +
A inc/cext/unicode.h | 9 +++++++++
A src/unicode.c | 43 +++++++++++++++++++++++++++++++++++++++++++
A tool/ucattab.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
7 files changed, 208 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,4 @@
 *.o
 *.a
+gen/*
+tool/ucattab
diff --git a/Makefile b/Makefile
@@ -8,6 +8,7 @@ SRC =\
 	src/log.c\
 	src/opt.c\
 	src/str.c\
+	src/unicode.c\
 	src/utf8.c
 
 libcext.a: $(SRC:.c=.o)
@@ -21,9 +22,25 @@ src/bench.o: src/bench.c inc/cext/bench.h inc/cext/cext.h inc/cext/def.h inc/cex
 src/log.o: src/log.c inc/cext/cext.h inc/cext/def.h inc/cext/log.h config.mk
 src/opt.o: src/opt.c inc/cext/cext.h inc/cext/def.h inc/cext/opt.h config.mk
 src/str.o: src/str.c inc/cext/alloc.h inc/cext/cext.h inc/cext/def.h inc/cext/log.h inc/cext/str.h config.mk
+src/unicode.o: src/unicode.c inc/cext/cext.h inc/cext/def.h inc/cext/unicode.h gen/ucattab.inc config.mk
 src/utf8.o: src/utf8.c inc/cext/cext.h inc/cext/def.h inc/cext/utf8.h config.mk
 
+# Generated files are written under a temporary name and then renamed, so
+# a failed command never leaves a truncated file that looks up to date.
+gen/ucattab.inc: tool/ucattab gen/UnicodeData.txt
+	mkdir -p gen
+	tool/ucattab gen/UnicodeData.txt > $@.tmp
+	mv $@.tmp $@
+
+gen/UnicodeData.txt:
+	mkdir -p gen
+	wget -q -O $@.tmp $(UNICODE_DATA_URL)
+	mv $@.tmp $@
+
+tool/ucattab: tool/ucattab.c src/alloc.o src/log.o src/str.o
+	$(CC) -o $@ $(CFLAGS) $^
+
 clean:
-	rm -f libcext.a $(SRC:.c=.o)
+	rm -rf libcext.a $(SRC:.c=.o) gen tool/ucattab
 
 .PHONY: clean
diff --git a/config.mk b/config.mk
@@ -1,3 +1,5 @@
 CC = cc
 CFLAGS = -std=c11 -O2 -Wall -pedantic -Iinc
 AR = ar
+
+UNICODE_DATA_URL = https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
diff --git a/inc/cext/all.h b/inc/cext/all.h
@@ -5,5 +5,6 @@
 #include "cext/log.h"
 #include "cext/opt.h"
 #include "cext/str.h"
+#include "cext/unicode.h"
 #include "cext/utf8.h"
 #include "cext/vector.h"
diff --git a/inc/cext/unicode.h b/inc/cext/unicode.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include "cext/def.h"
+
+/* Return the two-letter general category abbreviation of r (for example
+ * "Lu") as a NUL-terminated string with static storage duration, or a
+ * null pointer if r is greater than RUNE_MAX. The string belongs to the
+ * library: do not modify or free it. */
+char *cext_unicode_category(rune r);
diff --git a/src/unicode.c b/src/unicode.c
@@ -0,0 +1,43 @@
+#include "cext/cext.h"
+#include "cext/unicode.h"
+#include "../gen/ucattab.inc"
+
+/* Category abbreviations, three bytes each (two letters and a NUL), in
+ * the order used by the category indices stored in the generated tables.
+ * tool/ucattab.c carries an identical list; keep the two in sync. */
+static char ucats[] =
+	"Lu\0Ll\0Lt\0Lm\0Lo\0"
+	"Mn\0Mc\0Me\0"
+	"Nd\0Nl\0No\0"
+	"Pc\0Pd\0Ps\0Pe\0Pi\0Pf\0Po\0"
+	"Sm\0Sc\0Sk\0So\0"
+	"Zs\0Zl\0Zp\0"
+	"Cc\0Cf\0Cs\0Co\0Cn";
+
+/* Return the general category abbreviation of r as a NUL-terminated
+ * string inside ucats, or a null pointer if r is greater than RUNE_MAX. */
+char *
+cext_unicode_category(rune r) {
+	if (r <= 0xff) /* Latin 1 */
+		return &ucats[3 * ucatl1tab[r]];
+
+	if (r > RUNE_MAX)
+		return 0;
+
+	/* Binary search ucattab for the entry whose run contains r: the
+	 * last entry whose codepoint (low three bytes) is <= r. */
+	usize l = 0;
+	usize u = LEN(ucattab);
+	while (l < u) {
+		usize i = l + (u-l)/2;
+		u32 entry = ucattab[i];
+		if (r < (entry & 0xffffff))
+			u = i;
+		else if (i + 1 < LEN(ucattab) && r >= (ucattab[i+1] & 0xffffff))
+			l = i + 1;
+		else
+			return &ucats[3 * (entry >> 24)];
+	}
+
+	return 0;
+}
diff --git a/tool/ucattab.c b/tool/ucattab.c
@@ -0,0 +1,133 @@
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "cext/alloc.h"
+#include "cext/cext.h"
+#include "cext/log.h"
+#include "cext/str.h"
+
+#define NF 15 /* Number of fields in UnicodeData.txt */
+
+/* Map a general category abbreviation to its index in the list below,
+ * which must stay identical to ucats in src/unicode.c. An abbreviation
+ * that is not in the list is reported with fatalf. */
+u8
+cattoi(char *cat) {
+	static char ucats[] =
+		"Lu\0Ll\0Lt\0Lm\0Lo\0"
+		"Mn\0Mc\0Me\0"
+		"Nd\0Nl\0No\0"
+		"Pc\0Pd\0Ps\0Pe\0Pi\0Pf\0Po\0"
+		"Sm\0Sc\0Sk\0So\0"
+		"Zs\0Zl\0Zp\0"
+		"Cc\0Cf\0Cs\0Co\0Cn";
+	for (usize i = 0; i < LEN(ucats); i += 3) {
+		if (!strcmp(cat, &ucats[i]))
+			return i / 3;
+	}
+	fatalf("bad category '%s'", cat);
+	return 0; /* Suppress warning */
+}
+
+/* Read the UnicodeData.txt file at filename and return an array of
+ * RUNE_MAX + 1 category indices (see cattoi), indexed by codepoint.
+ * Codepoints that the file does not list get "Cn". An entry whose name
+ * ends in "First>" opens a range: every codepoint up to and including
+ * the next entry gets that next entry's category. */
+u8 *
+parse_cats(char *filename) {
+	FILE *f = fopen(filename, "rb");
+	if (!f) fatalf("fopen: %s", strerror(errno));
+
+	u8 *cats = ealloc(RUNE_MAX + 1);
+
+	char line[512];
+	bool inrange = false;
+	i32 prevcp = -1;
+	while (fgets(line, sizeof line, f)) {
+		char *nl = strchr(line, '\n');
+		if (!nl) fatalf("line too long");
+		*nl = '\0';
+
+		char **fields = (char *[NF]){0};
+		if (cext_str_split(&fields, line, ";", NF) != NF)
+			fatalf("line has too few fields");
+
+		/* cp indexes cats below, so insist on a complete hex number
+		 * that lies inside the table. */
+		char *end;
+		long val = strtol(fields[0], &end, 16);
+		if (end == fields[0] || *end != '\0' || val < 0 || val > RUNE_MAX)
+			fatalf("bad codepoint '%s'", fields[0]);
+		i32 cp = val;
+		char *name = fields[1];
+		u8 cat = cattoi(!strcmp(fields[2], "") ? "Cn" : fields[2]);
+
+		/* We expect UnicodeData.txt to be sorted, but I can't find any
+		 * guarantee of that in UAX #44. */
+		assert(cp > prevcp);
+		assert(!inrange || cext_str_ends_with(name, "Last>"));
+
+		for (i32 c = prevcp+1; c <= cp; c++)
+			cats[c] = inrange || c == cp ? cat : cattoi("Cn");
+
+		inrange = cext_str_ends_with(name, "First>");
+		prevcp = cp;
+	}
+	if (!feof(f)) fatalf("fgets: %s", strerror(errno));
+
+	for (i32 c = prevcp+1; c <= RUNE_MAX; c++)
+		cats[c] = cattoi("Cn");
+
+	fclose(f);
+
+	return cats;
+}
+
+/* Write the category tables for the UnicodeData.txt file named by the
+ * single command-line argument to stdout as C source. */
+int
+main(int argc, char **argv) {
+	if (argc != 2)
+		fatalf("usage: %s UNICODE_DATA_FILE", argv[0]);
+
+	u8 *cats = parse_cats(argv[1]);
+
+	printf(
+		"/* This file is automatically generated. */\n"
+		"\n"
+		"/* Special table to optimize for Latin 1 */\n"
+		"static u8 ucatl1tab[] = {"
+	);
+
+	for (rune i = 0; i <= 0xff; i++)
+		printf("%s0x%02x,", i%8 == 0 ? "\n\t" : " ", cats[i]);
+
+	printf(
+		"\n}; /* 256 bytes */\n"
+		"\n"
+		"/* The high byte is the category, and the low three bytes are the\n"
+		" * codepoint. Codepoints are listed in order, with consecutive\n"
+		" * entries with the same category compressed into one entry. */\n"
+		"static u32 ucattab[] = {"
+	);
+
+	usize tablen = 0;
+	for (rune i = 0x100; i <= RUNE_MAX; i++) {
+		if (i > 0x100 && cats[i] == cats[i-1])
+			continue;
+		printf("%s0x%08"PRIx32"ul,", tablen%4 == 0 ? "\n\t" : " ",
+			((rune) cats[i] << 24) | i);
+		tablen++;
+	}
+
+	printf("\n}; /* %zu bytes */\n", 4 * tablen);
+
+	/* The Makefile redirects stdout into a file that gets compiled, so
+	 * a failed write must not pass silently. */
+	if (ferror(stdout) || fflush(stdout) == EOF)
+		fatalf("error writing tables to stdout");
+}