From 388068bc1d06b5a5b2411a982398c48e2b79aadf Mon Sep 17 00:00:00 2001 From: Avril Date: Sun, 16 Apr 2023 04:55:31 +0100 Subject: [PATCH] exopt::util::leven_diff(): Added working levenshtein distance algorithm. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fortune for libexopt's current commit: Half blessing − 半吉 --- .gitignore | 4 + Makefile | 234 ++++++++++++++++++++++++++++++++++++++++++++++++ include/leven.h | 143 +++++++++++++++++++++++++++++ src/leven.cpp | 12 +++ 4 files changed, 393 insertions(+) create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 include/leven.h create mode 100644 src/leven.cpp diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f30e0fb --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +obj/ +lib*.so +lib*.a +*.o diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..99955de --- /dev/null +++ b/Makefile @@ -0,0 +1,234 @@ +# Makefile template, generic for libraries (static + shared) +PROJECT=exopt +DEFAULT_NAMESPACE=exopt +VERSION=0.0.0 + +SRC_C = $(wildcard src/*.c) +SRC_CXX = $(wildcard src/*.cpp) + +INCLUDE=include + +ifeq ($(PREFIX),) + PREFIX := /usr/local +endif + +# Default archivers +AR?=ar +RANLIB?=ranlib + +# Use gcc-{ar,ranlib} when using gcc +ifeq ($(CXX),g++) + AR=gcc-ar + RANLIB=gcc-ranlib +endif + +# Link to these libraries dynamicalls +SHARED_LIBS= +# Link to these libraries statically +STATIC_LIBS= + +override __VERSION_SPLIT:= $(subst ., ,$(VERSION)) +override __VERSION_REVISION:=$(word 3,$(__VERSION_SPLIT)) 0 + +VERSION_MAJOR:= $(word 1,$(__VERSION_SPLIT)) +VERSION_MINOR:= $(word 2,$(__VERSION_SPLIT)) +VERSION_BUGFIX:= $(word 3,$(__VERSION_SPLIT)) +VERSION_REVISION:= $(word 2,$(subst r, ,$(__VERSION_REVISION))) + +override __VERSION_SPLIT:= MAJOR:$(word 1,$(__VERSION_SPLIT)) MINOR:$(word 2,$(__VERSION_SPLIT)) BUGFIX:$(word 1,$(subst r, ,$(__VERSION_REVISION))) REVISION:$(word 2,$(subst r, ,$(__VERSION_REVISION))) REVISION_STRING:$(word 3,$(__VERSION_SPLIT)) + +COMMON_FLAGS?= -W -Wall +COMMON_FLAGS+= -pipe -Wstrict-aliasing -fno-strict-aliasing $(addprefix -I,$(INCLUDE)) +COMMON_FLAGS+= $(addprefix -D_VERSION_,$(subst :,=,$(__VERSION_SPLIT))) '-D_VERSION="$(VERSION)"' + +ifneq ($(DEFAULT_NAMESPACE),) + COMMON_FLAGS+= '-D_DEFAULT_NS=$(DEFAULT_NAMESPACE)' +endif + +## CPU feature flags +CPU_FLAGS+= + +DEFINITIONS?=_IMPL + +COMMON_FLAGS+=$(addprefix -D,$(DEFINITIONS)) +COMMON_FLAGS+=$(addprefix -m,$(CPU_FLAGS)) + +## For LTO and linking over all TUs in general +BINFLAGS+= + +DEBUG_BINFLAGS+= +RELEASE_BINFLAGS+= -fuse-linker-plugin + +# Target arch. Set to blank for generic +ARCH?=native +# Enable OpenMP and loop parallelisation? (dyn-links to openmp) +PARALLEL?=yes + +OPT_FLAGS?= -fgraphite \ + -floop-interchange -ftree-loop-distribution -floop-strip-mine -floop-block \ + -fno-stack-check + +SHARED_FLAGS+=-fPIC +SHARED_RELEASE_FLAGS+= +SHARED_DEBUG_FLAGS+= + +STATIC_FLAGS+= +STATIC_RELEASE_FLAGS+=-ffat-lto-objects +STATIC_DEBUG_FLAGS+= + +ifneq ($(ARCH),) + OPT_FLAGS+= $(addprefix -march=,$(ARCH)) +endif + +ifeq ($(PARALLEL),yes) + SHARED_FLAGS+= -fopenmp + SHARED_RELEASE_FLAGS+= -floop-parallelize-all -ftree-parallelize-loops=4 +endif + +CXX_OPT_FLAGS?= $(OPT_FLAGS) -felide-constructors + +CSTD?=gnu2x +CXXSTD?=gnu++23 + +CFLAGS += $(COMMON_FLAGS) --std=$(CSTD) +CXXFLAGS += $(COMMON_FLAGS) --std=$(CXXSTD) +LDFLAGS += $(addsuffix .a,$(addprefix -l:lib,$(STATIC_LIBS))) $(addprefix -l,$(SHARED_LIBS)) + +STRIP=strip + +# TODO: XXX: Benchmark to see if `-fno-plt` actually helps... +RELEASE_COMMON_FLAGS+= -fno-plt -fno-bounds-check + +DEBUG_COMMON_FLAGS+= -ggdb -gz -ftrapv -fbounds-check +# -fanalyzer + +ifneq ($(TARGET_SPEC_FLAGS),no) + RELEASE_CFLAGS?= -O3 -flto $(OPT_FLAGS) + RELEASE_CXXFLAGS?= -O3 -flto $(CXX_OPT_FLAGS) + RELEASE_LDFLAGS?= -Wl,-O3 -Wl,-flto + + #SHARED_FLAGS+=$(SHARED_RELEASE_FLAGS) + + DEBUG_CFLAGS?= -Og + DEBUG_CXXFLAGS?= -Og + + DEBUG_LDFLAGS?= +endif + +DEBUG_CFLAGS+=-DDEBUG $(DEBUG_COMMON_FLAGS) +DEBUG_CXXFLAGS+=-DDEBUG $(DEBUG_COMMON_FLAGS) -fasynchronous-unwind-tables + +RELEASE_CFLAGS+=-DRELEASE $(RELEASE_COMMON_FLAGS) +RELEASE_CXXFLAGS+=-DRELEASE $(RELEASE_COMMON_FLAGS) + +# Objects + +OBJ_C = $(addprefix obj/c/,$(SRC_C:.c=.o)) +OBJ_CXX = $(addprefix obj/cxx/,$(SRC_CXX:.cpp=.o)) +OBJ = $(OBJ_C) $(OBJ_CXX) + +# Phonies + +.PHONY: release +release: | dirs + $(MAKE) lib$(PROJECT).a + @$(MAKE) clean-rebuild >> /dev/null + @$(MAKE) dirs >> /dev/null + $(MAKE) lib$(PROJECT).so + +.PHONY: debug +debug: | dirs + $(MAKE) lib$(PROJECT)-debug.a + @$(MAKE) clean-rebuild >> /dev/null + @$(MAKE) dirs >> /dev/null + $(MAKE) lib$(PROJECT)-debug.so + +# Rebuild both release and debug targets from scratch +.PHONY: all +all: | clean + @$(MAKE) release + @$(MAKE) clean-rebuild + @$(MAKE) debug + +.PHONY: install +.PHONY: uninstall + +.PHONY: test +test: + @rm -f $(PROJECT)-test + @$(MAKE) $(PROJECT)-test + +# Targets + +dirs: + @mkdir -p obj/c{,xx}/src{,/rng} + +obj/c/%.o: %.c + $(CC) -c $< $(CFLAGS) -o $@ $(LDFLAGS) + +obj/cxx/%.o: %.cpp + $(CXX) -c $< $(CXXFLAGS) -o $@ $(LDFLAGS) + +lib$(PROJECT)-release.a: CFLAGS+= $(RELEASE_CFLAGS) $(STATIC_FLAGS) $(STATIC_RELEASE_FLAGS) +lib$(PROJECT)-release.a: CXXFLAGS += $(RELEASE_CXXFLAGS) $(STATIC_FLAGS) $(STATIC_RELEASE_FLAGS) +lib$(PROJECT)-release.a: LDFLAGS += $(RELEASE_LDFLAGS) +lib$(PROJECT)-release.a: $(OBJ) + $(AR) rcs $@ $^ + $(RANLIB) $@ + +lib$(PROJECT)-debug.a: CFLAGS+= $(DEBUG_CFLAGS) $(STATIC_FLAGS) $(STATIC_DEBUG_FLAGS) +lib$(PROJECT)-debug.a: CXXFLAGS += $(DEBUG_CXXFLAGS) $(STATIC_FLAGS) $(STATIC_DEBUG_FLAGS) +lib$(PROJECT)-debug.a: LDFLAGS += $(DEBUG_LDFLAGS) +lib$(PROJECT)-debug.a: $(OBJ) + $(AR) rcs $@ $^ + $(RANLIB) $@ + +lib$(PROJECT)-release.so: CFLAGS+= $(RELEASE_CFLAGS) $(SHARED_FLAGS) $(SHARED_RELEASE_FLAGS) +lib$(PROJECT)-release.so: CXXFLAGS += $(RELEASE_CXXFLAGS) $(SHARED_FLAGS) $(SHARED_RELEASE_FLAGS) +lib$(PROJECT)-release.so: LDFLAGS += $(RELEASE_LDFLAGS) +lib$(PROJECT)-release.so: BINFLAGS += $(RELEASE_BINFLAGS) +lib$(PROJECT)-release.so: $(OBJ) + $(CXX) -shared $^ $(BINFLAGS) $(CXXFLAGS) -o $@ $(LDFLAGS) + $(STRIP) $@ + +lib$(PROJECT)-debug.so: CFLAGS+= $(DEBUG_CFLAGS) $(SHARED_FLAGS) $(SHARED_DEBUG_FLAGS) +lib$(PROJECT)-debug.so: CXXFLAGS += $(DEBUG_CXXFLAGS) $(SHARED_FLAGS) $(SHARED_DEBUG_FLAGS) +lib$(PROJECT)-debug.so: LDFLAGS += $(DEBUG_LDFLAGS) +lib$(PROJECT)-debug.so: BINFLAGS += $(DEBUG_BINFLAGS) +lib$(PROJECT)-debug.so: $(OBJ) + $(CXX) -shared $^ $(BINFLAGS) $(CXXFLAGS) -o $@ $(LDFLAGS) + +lib$(PROJECT).a: lib$(PROJECT)-release.a + ln -f $< $@ + +lib$(PROJECT).so: LDFLAGS+= -Wl,-soname,lib$(PROJECT).so.$(VERSION_MAJOR) +lib$(PROJECT).so: lib$(PROJECT)-release.so + ln -f $< $@.$(VERSION) + ln -sf $@.$(VERSION) $@.$(VERSION_MAJOR) + ln -sf $@.$(VERSION_MAJOR) $@ + +clean-rebuild: + rm -rf obj + +clean: clean-rebuild + rm -f lib$(PROJECT){,-{release,debug,pgo}}.{a,so{,.*}} + rm -f $(PROJECT)-test + +install: + install -d $(DESTDIR)$(PREFIX)/lib/ + install -m 644 lib$(PROJECT).a $(DESTDIR)$(PREFIX)/lib/ + install -s -m 755 lib$(PROJECT).so.$(VERSION) $(DESTDIR)$(PREFIX)/lib/ + ln -sf lib$(PROJECT).so.$(VERSION) $(DESTDIR)$(PREFIX)/lib/lib$(PROJECT).so.$(VERSION_MAJOR) + ln -sf lib$(PROJECT).so.$(VERSION_MAJOR) $(DESTDIR)$(PREFIX)/lib/lib$(PROJECT).so + install -d $(DESTDIR)$(PREFIX)/include/$(PROJECT)/ + install -m 644 $(wildcard $(INCLUDE)/*.*) $(DESTDIR)$(PREFIX)/include/$(PROJECT)/ +uninstall: + -rm $(DESTDIR)$(PREFIX)/lib/lib$(PROJECT).{a,so{,.*}} + cd $(INCLUDE) && find . -type f | xargs -I {} rm "$(DESTDIR)$(PREFIX)/include/$(PROJECT)/{}" + -rmdir $(DESTDIR)$(PREFIX)/include/$(PROJECT) + +$(PROJECT)-test: LDFLAGS+= -lfmt -lstdc++ +$(PROJECT)-test: CFLAGS+= -Og -g +$(PROJECT)-test: lib$(PROJECT)-debug.a + $(CC) $(CFLAGS) src/test/*.c -o $@ -l:$< $(LDFLAGS) + -valgrind ./$@ diff --git a/include/leven.h b/include/leven.h new file mode 100644 index 0000000..0eb26f8 --- /dev/null +++ b/include/leven.h @@ -0,0 +1,143 @@ +#ifndef _LEVEN_H +#define _LEVEN_H + +#ifdef __cplusplus + +#include +#include +#include +#include +#include + +extern "C" { +#endif + +#ifdef __cplusplus +} + +namespace exopt { namespace util [[gnu::visibility("internal")]] { +#if 0 + void** allocate_array(size_t elem, size_t d1, size_t d2) [[gnu::returns_nonnull]]; + void deallocate_array(void** array) noexcept; + + /*constexpr auto min(auto const& a, auto const& b) noexcept + -> std::common_type_t + requires(requires(decltype(a) _a, decltype(b) _b) { + { _a < _b } -> std::convertible_to; + }) { + return a < b ? a : b; + }*/ + + /// Compute the levenshtein distance. + constexpr auto leven_diff(std::convertible_to auto const& a, std::convertible_to auto const& b) noexcept { + std::string_view s = a, + t = b; + const signed long n = long(s.size()); + const signed long m = long(t.size()); + + if(__builtin_expect(!n, 0)) return m; + if(__builtin_expect(!m, 0)) return n; + + //constexpr + auto&& work_on_d = [&](auto& d) { + for(long i=0; i <= n ; d[i][0] = i) i+=1; + for(long i=0; i <= m ; d[0][i] = i) i+=1; + + for(long i=1; i <= n; i++) { + for(long j=1; j<=m; j++) + { + using std::min; + const auto cost = std::type_identity_t(! (t[j-1] == s[i-1])); + d[i][j] = min( + min(d[i-1][j] + 1, d[i][j-1] + 1) + , d[i-1][j-1] + cost); + } + } + return d[n][m]; + }; + //TODO: XXX Multidimensional array not working... fuck THIS + if consteval { + using Vec2 = std::vector>; + struct vec : public Vec2 ///TODO: XXX: best way to do this? We just want the `d`` ctor overload... + { +//TODO: for below ctor of inner (size_t, constT&), since it's ambiguous and chooses the incorrect one no matter what + using Vec2::Vec2;//(size_t, signed long const&, Vec2::allocator_type const&); + vec(signed long, signed long) = delete; + vec(signed long, size_t) = delete; + vec(size_t) = delete; + + constexpr ~vec() noexcept = default; + + + constexpr operator Vec2&&() && noexcept { return std::move(*static_cast(this)); } + constexpr operator Vec2&() const&& = delete; + }; + + const auto inner = std::vector{ size_t(m) + 1 }; + std::vector> d{ static_cast(std::move(vec{size_t(n) + 1, inner})) }; + return work_on_d(d); + } else { + auto** d = reinterpret_cast(allocate_array(sizeof(signed long), m+1, n+1)); + try { + return work_on_d(d); + } catch(...) { + deallocate_array(reinterpret_cast(d)); + throw; + } + } + } +#endif + constexpr auto leven_diff(std::string_view s1, std::string_view s2) noexcept { + const size_t + m(s1.size()), + n(s2.size()); + + if(__builtin_expect(!m, false)) return n; + if(__builtin_expect(!n, false)) return m; + + auto&& work_on_d = [&](auto&& costs) { + for(size_t i=0;i <= n; i++) costs[i] = i; + + size_t i{0}; + for(auto const& c1: s1) { + costs[0] = i + 1; + size_t corner { i }, + j { 0 }; + + for(auto const& c2: s2) { + size_t upper { costs[j+1] }; + if( c1 == c2 ) costs[j+1] = corner; + else { + using std::min; + size_t t{min(upper, corner)};//upper(n + 1)); + } else { + thread_local static std::vector d; + const size_t n1 = n + 1; + if(__builtin_expect(d.size() < n1, false)) d.resize(n1); + // We don't need to clear the buffer, it will be reinitialised by `work_on_d()`. + return work_on_d(std::span{d.begin(), n1}); + } + } + template S1, std::convertible_to S2> + constexpr decltype(auto) leven_diff(const S1& sa, const S2& sb) noexcept { + using str = std::string_view; + return leven_diff(str{sa}, str{sb}); + } + +} } +#endif + +#endif /* _LEVEN_H */ diff --git a/src/leven.cpp b/src/leven.cpp new file mode 100644 index 0000000..83bd798 --- /dev/null +++ b/src/leven.cpp @@ -0,0 +1,12 @@ + +#include +#include + +#include + +namespace exopt { + namespace util [[gnu::visibility("internal")]] { + static_assert(leven_diff("hello world", "Hello World") == 2, "Levelshtein distance incorrect for non-matching strings"); + static_assert(leven_diff("hello world", "hello world") == 0, "Levelshtein distance incorrect for matching strings"); + } +}