diff --git a/libraries/config/common.mk b/libraries/config/common.mk
index b05082292..964a7d0aa 100644
--- a/libraries/config/common.mk
+++ b/libraries/config/common.mk
@@ -76,17 +76,12 @@ TARGET       := $(notdir $(CURDIR))
 BUILD        := build
 DATA         := data
 INCLUDES     := include
-SOURCES      ?= source $(foreach d,$(filter-out source/arch source/board source,$(wildcard source/*)),$(if $(wildcard $d/.),$(call DIR_WILDCARD,$d) $d,))
 
-ifneq ($(strip $(wildcard source/$(ATMOSPHERE_ARCH_DIR)/.*)),)
-SOURCES += source/$(ATMOSPHERE_ARCH_DIR) $(call DIR_WILDCARD,source/$(ATMOSPHERE_ARCH_DIR))
-endif
-ifneq ($(strip $(wildcard source/$(ATMOSPHERE_BOARD_DIR)/.*)),)
-SOURCES += source/$(ATMOSPHERE_BOARD_DIR) $(call DIR_WILDCARD,source/$(ATMOSPHERE_BOARD_DIR))
-endif
-ifneq ($(strip $(wildcard source/$(ATMOSPHERE_OS_DIR)/.*)),)
-SOURCES += source/$(ATMOSPHERE_OS_DIR) $(call DIR_WILDCARD,source/$(ATMOSPHERE_OS_DIR))
-endif
+GENERAL_SOURCE_DIRS=$1 $(foreach d,$(filter-out $1/arch $1/board $1,$(wildcard $1/*)),$(if $(wildcard $d/.),$(call DIR_WILDCARD,$d) $d,))
+SPECIFIC_SOURCE_DIRS=$(if $2,$(if $(wildcard $1/$2/.*),$1/$2 $(call DIR_WILDCARD,$1/$2),),)
+ALL_SOURCE_DIRS=$(call GENERAL_SOURCE_DIRS,$1) $(call SPECIFIC_SOURCE_DIRS,$1,$(ATMOSPHERE_ARCH_DIR)) $(call SPECIFIC_SOURCE_DIRS,$1,$(ATMOSPHERE_BOARD_DIR)) $(call SPECIFIC_SOURCE_DIRS,$1,$(ATMOSPHERE_OS_DIR))
+# $(call ALL_SOURCE_DIRS,root): root, its subdirectories other than arch/board, and the selected arch/board/os directories. An empty selector adds nothing (it would otherwise match root itself).
+SOURCES      ?= $(call ALL_SOURCE_DIRS,source)
 
 #---------------------------------------------------------------------------------
 # Rules for compiling pre-compiled headers
diff --git a/libraries/libmesosphere/Makefile b/libraries/libmesosphere/Makefile
index be6fb94d6..79213e4fc 100644
--- a/libraries/libmesosphere/Makefile
+++ b/libraries/libmesosphere/Makefile
@@ -14,6 +14,8 @@ CFLAGS      := $(ATMOSPHERE_CFLAGS) $(SETTINGS) $(DEFINES) $(INCLUDE)
 CXXFLAGS    := $(CFLAGS) $(ATMOSPHERE_CXXFLAGS) -fno-use-cxa-atexit -flto
 ASFLAGS     := $(ATMOSPHERE_ASFLAGS) $(SETTINGS)
 
+# Also add libvapours' source directories (general plus the selected arch/board/os ones) to this library's SOURCES.
+SOURCES     += $(call ALL_SOURCE_DIRS,../libvapours/source)
 LIBS        :=
 
 #---------------------------------------------------------------------------------
diff --git a/libraries/libstratosphere/Makefile b/libraries/libstratosphere/Makefile
index d7a3965ea..16328c048 100644
--- a/libraries/libstratosphere/Makefile
+++ b/libraries/libstratosphere/Makefile
@@ -23,6 +23,8 @@ ASFLAGS     := $(ATMOSPHERE_ASFLAGS) $(SETTINGS)
 
 LDFLAGS     := -specs=$(DEVKITPRO)/libnx/switch.specs $(SETTINGS) -Wl,-Map,$(notdir $*.map)
 
+# Also add libvapours' source directories (general plus the selected arch/board/os ones) to this library's SOURCES.
+SOURCES     += $(call ALL_SOURCE_DIRS,../libvapours/source)
 LIBS        := -lnx
 
 #---------------------------------------------------------------------------------
diff --git a/libraries/libvapours/include/vapours/assert.hpp b/libraries/libvapours/include/vapours/assert.hpp
index b4fe1fbee..74a5f0590 100644
--- a/libraries/libvapours/include/vapours/assert.hpp
+++ b/libraries/libvapours/include/vapours/assert.hpp
@@ -19,7 +19,7 @@
 namespace ams::impl {
 
     template<typename... ArgTypes>
-    ALWAYS_INLINE void UnusedImpl(ArgTypes... args) {
+    constexpr ALWAYS_INLINE void UnusedImpl(ArgTypes... args) { /* Discards every argument through a void cast (fold expression); has no other effect. */
         (static_cast<void>(args), ...);
     }
 
diff --git a/libraries/libvapours/include/vapours/crypto.hpp b/libraries/libvapours/include/vapours/crypto.hpp
index 274db7952..a468ee98a 100644
--- a/libraries/libvapours/include/vapours/crypto.hpp
+++ b/libraries/libvapours/include/vapours/crypto.hpp
@@ -18,3 +18,5 @@
 #include <vapours/defines.hpp>
 
 #include <vapours/crypto/crypto_memory_compare.hpp>
+#include <vapours/crypto/crypto_memory_clear.hpp>
+#include <vapours/crypto/impl/crypto_bignum.hpp>
diff --git a/libraries/libvapours/include/vapours/crypto/crypto_memory_clear.hpp b/libraries/libvapours/include/vapours/crypto/crypto_memory_clear.hpp
new file mode 100644
index 000000000..6a8be44e8
--- /dev/null
+++ b/libraries/libvapours/include/vapours/crypto/crypto_memory_clear.hpp
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2018-2020 Atmosphère-NX
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include <vapours/common.hpp>
+#include <vapours/assert.hpp>
+#include <vapours/util.hpp>
+
+namespace ams::crypto {
+    /* Overwrites the size bytes starting at mem with zeroes. */
+    void ClearMemory(void *mem, size_t size);
+
+}
diff --git a/libraries/libvapours/include/vapours/crypto/crypto_memory_compare.hpp b/libraries/libvapours/include/vapours/crypto/crypto_memory_compare.hpp
index 4798f7d4f..68040a4b8 100644
--- a/libraries/libvapours/include/vapours/crypto/crypto_memory_compare.hpp
+++ b/libraries/libvapours/include/vapours/crypto/crypto_memory_compare.hpp
@@ -19,20 +19,8 @@
 #include <vapours/assert.hpp>
 #include <vapours/util.hpp>
 
-#ifdef ATMOSPHERE_ARCH_ARM64
-
-#include <vapours/crypto/impl/crypto_memory_compare.arch.arm64.hpp>
-
-#else
-
-#error "Unknown architecture for crypto::IsSameBytes"
-
-#endif
-
 namespace ams::crypto {
 
-    inline bool IsSameBytes(const void *lhs, const void *rhs, size_t size) {
-        return impl::IsSameBytes(lhs, rhs, size);
-    }
+    bool IsSameBytes(const void *lhs, const void *rhs, size_t size); /* Defined out-of-line in source/crypto/crypto_memory_compare.arch.arm64.cpp. */
 
 }
diff --git a/libraries/libvapours/include/vapours/crypto/impl/crypto_bignum.hpp b/libraries/libvapours/include/vapours/crypto/impl/crypto_bignum.hpp
new file mode 100644
index 000000000..585e41975
--- /dev/null
+++ b/libraries/libvapours/include/vapours/crypto/impl/crypto_bignum.hpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2018-2020 Atmosphère-NX
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include <vapours/common.hpp>
+#include <vapours/assert.hpp>
+#include <vapours/util.hpp>
+#include <vapours/crypto/crypto_memory_compare.hpp>
+#include <vapours/crypto/crypto_memory_clear.hpp>
+
+namespace ams::crypto::impl {
+
+    class BigNum { /* Unsigned big integer kept as an array of 32-bit words, least significant word first. */
+        NON_COPYABLE(BigNum);
+        NON_MOVEABLE(BigNum);
+        public:
+            using HalfWord   = u16;
+            using Word       = u32;
+            using DoubleWord = u64;
+            /* Limits and widths derived from the word types above. */
+            static constexpr size_t MaxBits = 4096;
+            static constexpr size_t BitsPerWord = sizeof(Word) * CHAR_BIT;
+            static constexpr Word MaxWord     = std::numeric_limits<Word>::max();
+            static constexpr Word MaxHalfWord = std::numeric_limits<HalfWord>::max();
+            /* Bump allocator over a caller-supplied Word buffer; allocations are expected to be released in reverse order. */
+            class WordAllocator {
+                    NON_COPYABLE(WordAllocator);
+                    NON_MOVEABLE(WordAllocator);
+                public:
+                    class Allocation { /* Handle to one allocation; returns its words to the allocator when destroyed. */
+                        NON_COPYABLE(Allocation);
+                        NON_MOVEABLE(Allocation);
+                        private:
+                            friend class WordAllocator;
+                        private:
+                            WordAllocator *allocator;
+                            Word *buffer;
+                            size_t count;
+                        private:
+                            constexpr ALWAYS_INLINE Allocation(WordAllocator *a, Word *w, size_t c) : allocator(a), buffer(w), count(c) { /* ... */ }
+                        public:
+                            ALWAYS_INLINE ~Allocation() { if (allocator) { allocator->Free(this->buffer, this->count); } }
+                            /* Accessors; a failed allocation has a null buffer and a null allocator, so it frees nothing. */
+                            constexpr ALWAYS_INLINE Word *GetBuffer() const { return this->buffer; }
+                            constexpr ALWAYS_INLINE size_t GetCount() const { return this->count; }
+                            constexpr ALWAYS_INLINE bool IsValid() const { return this->buffer != nullptr; }
+                    };
+
+                    friend class Allocation;
+                private:
+                    Word *buffer;     /* Start of the words that are still free. */
+                    size_t count;     /* Number of words that are still free. */
+                    size_t max_count; /* Total capacity given at construction. */
+                    size_t min_count; /* Smallest free count ever reached. */
+                private:
+                    ALWAYS_INLINE void Free(void *words, size_t num) {
+                        this->buffer -= num;
+                        this->count  += num;
+                        /* After rewinding, the free region must start at the released block: only the newest allocation may be freed. */
+                        AMS_ASSERT(words == this->buffer);
+                    }
+                public:
+                    constexpr ALWAYS_INLINE WordAllocator(Word *buf, size_t c) : buffer(buf), count(c), max_count(c), min_count(c) { /* ... */ }
+                    /* Carves num words off the front of the free region; yields an invalid Allocation when too few remain. */
+                    ALWAYS_INLINE Allocation Allocate(size_t num) {
+                        if (num <= this->count) {
+                            Word *allocated = this->buffer;
+
+                            this->buffer += num;
+                            this->count -= num;
+                            this->min_count = std::min(this->count, this->min_count); /* Track the low-water mark of free words. */
+
+                            return Allocation(this, allocated, num);
+                        } else {
+                            return Allocation(nullptr, nullptr, 0);
+                        }
+                    }
+                    /* Largest number of bytes that were ever allocated at the same time. */
+                    constexpr ALWAYS_INLINE size_t GetMaxUsedSize() const {
+                        return (this->max_count - this->min_count) * sizeof(Word);
+                    }
+            };
+        private:
+            Word *words;      /* Backing storage, supplied through ReserveStatic. */
+            size_t num_words; /* Words currently in use; zero means the value is zero. */
+            size_t max_words; /* Capacity of the backing storage, in words. */
+        private:
+            static void ImportImpl(Word *out, size_t out_size, const u8 *src, size_t src_size);
+            static void ExportImpl(u8 *out, size_t out_size, const Word *src, size_t src_size);
+        public:
+            constexpr BigNum() : words(), num_words(), max_words() { /* ... */ }
+            ~BigNum() { /* ... */ }
+            /* Points this number at externally owned storage holding capacity words; num_words is left unchanged. */
+            constexpr void ReserveStatic(Word *buf, size_t capacity) {
+                this->words = buf;
+                this->max_words = capacity;
+            }
+            /* Conversion from and to a big-endian octet string; Import returns false when the value does not fit. */
+            bool Import(const void *src, size_t src_size);
+            void Export(void *dst, size_t dst_size);
+
+            size_t GetSize() const; /* Octets needed to hold the value; zero when no words are in use. */
+
+            bool IsZero() const {
+                return this->num_words == 0;
+            }
+            /* Modular exponentiation with *this as the modulus: dst = (src ^ exp) mod *this, using work_buf as scratch space. */
+            bool ExpMod(void *dst, const void *src, size_t size, const BigNum &exp, u32 *work_buf, size_t work_buf_size) const;
+            void ClearToZero();
+            void UpdateCount();
+        public:
+            /* Utility. */
+            static bool   IsZero(const Word *w, size_t num_words);
+            static int    Compare(const Word *lhs, const Word *rhs, size_t num_words);
+            static size_t CountWords(const Word *w, size_t num_words);
+            static size_t CountSignificantBits(Word w);
+            static void   ClearToZero(Word *w, size_t num_words);
+            static void   SetToWord(Word *w, size_t num_words, Word v);
+            static void   Copy(Word *dst, const Word *src, size_t num_words);
+
+            /* Arithmetic. */
+            static bool   ExpMod(Word *dst, const Word *src, const Word *exp, size_t exp_num_words, const Word *mod, size_t mod_num_words, WordAllocator *allocator);
+            static bool   MultMod(Word *dst, const Word *src, const Word *mult, const Word *mod, size_t num_words, WordAllocator *allocator);
+            static bool   Mod(Word *dst, const Word *src, size_t src_words, const Word *mod, size_t mod_words, WordAllocator *allocator);
+            static bool   DivMod(Word *quot, Word *rem, const Word *top, size_t top_words, const Word *bot, size_t bot_words, WordAllocator *allocator);
+            static bool   Mult(Word *dst, const Word *lhs, const Word *rhs, size_t num_words, WordAllocator *allocator);
+
+            static Word   LeftShift(Word *dst, const Word *w, size_t num_words, const size_t shift);
+            static Word   RightShift(Word *dst, const Word *w, size_t num_words, const size_t shift);
+            static Word   Add(Word *dst, const Word *lhs, const Word *rhs, size_t num_words);
+            static Word   Sub(Word *dst, const Word *lhs, const Word *rhs, size_t num_words);
+            static Word   MultAdd(Word *dst, const Word *w, size_t num_words, Word mult);
+            static Word   MultSub(Word *dst, const Word *w, const Word *v, size_t num_words, Word mult);
+    };
+
+    template<size_t Bits>
+    class StackBigNum : public BigNum {
+        public:
+            /* Capacity of the embedded buffer, expressed in bits, whole words and bytes. */
+            static constexpr size_t NumBits  = Bits;
+            static constexpr size_t NumWords = util::AlignUp(Bits, BitsPerWord) / BitsPerWord;
+            static constexpr size_t NumBytes = sizeof(Word) * NumWords;
+        private:
+            Word word_buf[NumWords];
+        public:
+            /* Zero-initialise the embedded buffer and register it as this number's storage. */
+            constexpr StackBigNum() : word_buf() { this->ReserveStatic(this->word_buf, NumWords); }
+    };
+
+}
diff --git a/libraries/libvapours/include/vapours/svc/svc_common.hpp b/libraries/libvapours/include/vapours/svc/svc_common.hpp
index e5e3c3353..49bd91327 100644
--- a/libraries/libvapours/include/vapours/svc/svc_common.hpp
+++ b/libraries/libvapours/include/vapours/svc/svc_common.hpp
@@ -24,10 +24,8 @@ namespace ams::svc {
     /* TODO: C++ style handle? */
 #ifdef ATMOSPHERE_IS_STRATOSPHERE
     using Handle = ::Handle;
-#elif defined ATMOSPHERE_IS_MESOSPHERE
-    using Handle = u32;
 #else
-    #error "Unknown target for svc::Handle"
+    using Handle = u32; /* Used by every target that is not stratosphere. */
 #endif
 
     static constexpr size_t MaxWaitSynchronizationHandleCount = 0x40;
diff --git a/libraries/libvapours/source/crypto/crypto_memory_clear.cpp b/libraries/libvapours/source/crypto/crypto_memory_clear.cpp
new file mode 100644
index 000000000..a5bb28320
--- /dev/null
+++ b/libraries/libvapours/source/crypto/crypto_memory_clear.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018-2020 Atmosphère-NX
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <vapours.hpp>
+
+namespace ams::crypto {
+
+    void ClearMemory(void *_mem, size_t size) {
+        /* Zero size bytes one at a time through a volatile pointer, so each store is an observable access. */
+        volatile u8 *cur = static_cast<volatile u8 *>(_mem);
+        for (volatile u8 * const end = cur + size; cur != end; ++cur) {
+            *cur = 0;
+        }
+    }
+
+}
\ No newline at end of file
diff --git a/libraries/libvapours/include/vapours/crypto/impl/crypto_memory_compare.arch.arm64.hpp b/libraries/libvapours/source/crypto/crypto_memory_compare.arch.arm64.cpp
similarity index 91%
rename from libraries/libvapours/include/vapours/crypto/impl/crypto_memory_compare.arch.arm64.hpp
rename to libraries/libvapours/source/crypto/crypto_memory_compare.arch.arm64.cpp
index f1a1350f7..08dbb2d47 100644
--- a/libraries/libvapours/include/vapours/crypto/impl/crypto_memory_compare.arch.arm64.hpp
+++ b/libraries/libvapours/source/crypto/crypto_memory_compare.arch.arm64.cpp
@@ -13,14 +13,11 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
+#include <vapours.hpp>
 
-#pragma once
-#include <vapours/defines.hpp>
-#include <vapours/util.hpp>
+namespace ams::crypto {
 
-namespace ams::crypto::impl {
-
-    inline bool IsSameBytes(const void *lhs, const void *rhs, size_t size) {
+    bool IsSameBytes(const void *lhs, const void *rhs, size_t size) {
         bool result;
         u8 xor_acc, ltmp, rtmp;
         size_t index;
diff --git a/libraries/libvapours/source/crypto/impl/crypto_bignum.cpp b/libraries/libvapours/source/crypto/impl/crypto_bignum.cpp
new file mode 100644
index 000000000..4b4927fe1
--- /dev/null
+++ b/libraries/libvapours/source/crypto/impl/crypto_bignum.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2018-2020 Atmosphère-NX
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <vapours.hpp>
+
+namespace ams::crypto::impl {
+
+    void BigNum::ImportImpl(Word *out, size_t out_size, const u8 *src, size_t src_size) {
+        /* src is a big-endian octet string: its last octet is the least significant one. */
+        /* Every one of the out_size words is written; words past the end of src become zero. */
+        size_t remaining = src_size;
+
+        for (size_t idx = 0; idx < out_size; idx++) {
+            Word value = 0;
+
+            /* Pack up to one word of octets, taken from the tail of what is left of src. */
+            for (size_t shift = 0; shift < BITSIZEOF(Word) && remaining > 0; shift += BITSIZEOF(u8)) {
+                remaining--;
+                value |= static_cast<Word>(src[remaining]) << shift;
+            }
+
+            /* Once src is exhausted, value simply stays zero. */
+            out[idx] = value;
+        }
+    }
+
+    void BigNum::ExportImpl(u8 *out, size_t out_size, const Word *src, size_t src_size) {
+        /* Emit big-endian octets: fill out from its tail, starting with the least significant word. */
+        u8 *cursor = out + out_size;
+        size_t word_idx = 0;
+        size_t shift    = 0;
+
+        while (cursor != out) {
+            if (word_idx < src_size) {
+                *--cursor = static_cast<u8>(src[word_idx] >> shift);
+                if ((shift += BITSIZEOF(u8)) == BITSIZEOF(Word)) { shift = 0; word_idx++; }
+            } else {
+                /* Source exhausted: the remaining leading octets are zero. */
+                *--cursor = 0;
+            }
+        }
+    }
+
+    size_t BigNum::GetSize() const {
+        static_assert(sizeof(Word) == 4);
+
+        /* A number with no words in use occupies no octets. */
+        if (this->num_words == 0) {
+            return 0;
+        }
+
+        /* The most significant word in use is expected to be non-zero. */
+        const Word last = this->words[this->num_words - 1];
+        AMS_ASSERT(last != 0);
+
+        /* Start from whole words and drop each leading zero octet of the top word (at most three). */
+        size_t octets = this->num_words * sizeof(Word);
+        for (Word threshold = 0x01000000u; threshold > 0x00000001u && last < threshold; threshold >>= 8) {
+            octets--;
+        }
+        return octets;
+    }
+
+    bool BigNum::Import(const void *src, size_t src_size) {
+        /* Loads a big-endian octet string; returns false if it does not fit. A null src is valid only when src_size is zero. */
+        AMS_ASSERT((src != nullptr) || (src_size == 0));
+
+        /* Ignore leading zeroes. */
+        const u8 *data = static_cast<const u8 *>(src);
+        while (src_size > 0 && *data == 0) {
+            ++data;
+            --src_size;
+        }
+
+        /* Ensure we have space for the number. */
+        AMS_ASSERT(src_size <= this->max_words * sizeof(Word));
+        if (AMS_UNLIKELY(!(src_size <= this->max_words * sizeof(Word)))) {
+            return false;
+        }
+
+        /* Import; every word of the storage is written, so words above the value become zero. */
+        this->num_words = util::AlignUp(src_size, sizeof(Word)) / sizeof(Word);
+        ImportImpl(this->words, this->max_words, data, src_size);
+        return true;
+    }
+
+    void BigNum::Export(void *dst, size_t dst_size) {
+        AMS_ASSERT(dst_size >= this->GetSize()); /* dst must have room for every significant octet. */
+        ExportImpl(static_cast<u8 *>(dst), dst_size, this->words, this->num_words); /* Big-endian output; unused leading octets of dst become zero. */
+    }
+
+    bool BigNum::ExpMod(void *dst, const void *src, size_t size, const BigNum &exp, u32 *work_buf, size_t work_buf_size) const {
+        /* Computes (src ^ exp) mod *this into dst; src and dst are size octets. A zero modulus or exponent is rejected. */
+        if (this->IsZero() || exp.IsZero()) {
+            return false;
+        }
+        AMS_ASSERT(size == this->GetSize());
+
+        /* Create an allocator over the scratch buffer; the scope-exit handler wipes whatever part of it was used. */
+        WordAllocator allocator(work_buf, work_buf_size / sizeof(Word));
+        ON_SCOPE_EXIT { ClearMemory(work_buf, allocator.GetMaxUsedSize()); };
+
+        /* Create a BigNum for the signature, sized to the modulus word count (size / sizeof(Word) would drop a partial word). */
+        BigNum signature;
+        auto signature_words = allocator.Allocate(this->num_words);
+        if (!signature_words.IsValid()) {
+            return false;
+        }
+
+        /* Import data for the signature. */
+        signature.ReserveStatic(signature_words.GetBuffer(), signature_words.GetCount());
+        if (!signature.Import(src, size)) {
+            return false;
+        }
+
+        /* Perform the exponentiation in place, with this number as the modulus. */
+        if (!ExpMod(signature.words, signature.words, exp.words, exp.num_words, this->words, this->num_words, std::addressof(allocator))) {
+            return false;
+        }
+
+        /* We succeeded, so export. */
+        signature.UpdateCount();
+        signature.Export(dst, size);
+
+        return true;
+    }
+
+    void BigNum::ClearToZero() {
+        ClearToZero(this->words, this->num_words); /* Zeroes only the num_words words currently in use; num_words itself is not changed. */
+    }
+
+    void BigNum::UpdateCount() {
+        this->num_words = CountWords(this->words, this->max_words); /* Recount the words in use across the whole capacity. */
+    }
+
+}
\ No newline at end of file
diff --git a/libraries/libvapours/source/crypto/impl/crypto_bignum_operations.cpp b/libraries/libvapours/source/crypto/impl/crypto_bignum_operations.cpp
new file mode 100644
index 000000000..0118f931d
--- /dev/null
+++ b/libraries/libvapours/source/crypto/impl/crypto_bignum_operations.cpp
@@ -0,0 +1,425 @@
+/*
+ * Copyright (c) 2018-2020 Atmosphère-NX
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <vapours.hpp>
+
+namespace ams::crypto::impl {
+
+    namespace {
+
+        constexpr ALWAYS_INLINE BigNum::Word GetTop2Bits(BigNum::Word w) {
+            return 0x3u & (w >> (BITSIZEOF(BigNum::Word) - 2)); /* The two most significant bits, as a value in [0, 3]. */
+        }
+
+        constexpr ALWAYS_INLINE void MultWord(BigNum::Word *dst, BigNum::Word lhs, BigNum::Word rhs) {
+            static_assert(sizeof(BigNum::DoubleWord) == sizeof(BigNum::Word) * 2);
+            const BigNum::DoubleWord product = static_cast<BigNum::DoubleWord>(lhs) * rhs; /* Widen first so no product bits are lost. */
+            dst[0] = static_cast<BigNum::Word>(product);                                   /* Low word of the product. */
+            dst[1] = static_cast<BigNum::Word>(product >> BITSIZEOF(BigNum::Word));        /* High word of the product. */
+        }
+
+        constexpr ALWAYS_INLINE BigNum::HalfWord GetUpperHalf(BigNum::Word word) {
+            static_assert(sizeof(BigNum::Word) == sizeof(BigNum::HalfWord) * 2);
+            return static_cast<BigNum::HalfWord>((word >> BITSIZEOF(BigNum::HalfWord)) & BigNum::MaxHalfWord); /* Most significant half of word. */
+        }
+
+        constexpr ALWAYS_INLINE BigNum::HalfWord GetLowerHalf(BigNum::Word word) {
+            static_assert(sizeof(BigNum::Word) == sizeof(BigNum::HalfWord) * 2);
+            return static_cast<BigNum::HalfWord>(word & BigNum::MaxHalfWord); /* Least significant half of word. */
+        }
+
+        constexpr ALWAYS_INLINE BigNum::Word ToUpperHalf(BigNum::HalfWord half) {
+            static_assert(sizeof(BigNum::Word) == sizeof(BigNum::HalfWord) * 2);
+            return BigNum::Word{half} << BITSIZEOF(BigNum::HalfWord); /* half moved into the most significant half; low half is zero. */
+        }
+
+        constexpr ALWAYS_INLINE BigNum::Word ToLowerHalf(BigNum::HalfWord half) {
+            static_assert(sizeof(BigNum::Word) == sizeof(BigNum::HalfWord) * 2);
+            return BigNum::Word{half}; /* half zero-extended to a full word. */
+        }
+
+        constexpr ALWAYS_INLINE BigNum::Word DivWord(const BigNum::Word *w, BigNum::Word div) {
+            using Word = BigNum::Word;
+            using HalfWord = BigNum::HalfWord;
+            /* Divides the two-word value in w (w[0] low word, w[1] high word) by div; the quotient is built from two half-word digits. */
+            Word work[2] = { w[0], w[1] };
+            HalfWord r_hi = 0, r_lo = 0;
+            /* NOTE(review): presumably requires w[1] < div so the quotient fits in one word -- confirm against callers. */
+            HalfWord d_hi = GetUpperHalf(div);
+            HalfWord d_lo = GetLowerHalf(div);
+            /* Initial estimate of the upper quotient digit; the correction loop below only ever raises it. */
+            if (d_hi == BigNum::MaxHalfWord) {
+                r_hi = GetUpperHalf(work[1]);
+            } else {
+                r_hi = GetLowerHalf(work[1] / (d_hi + 1));
+            }
+            /* Subtract (r_hi * div), shifted up by half a word, from work; a wrapped low word borrows from the high word. */
+            {
+                const Word hh = static_cast<Word>(r_hi) * static_cast<Word>(d_hi);
+                const Word hl = static_cast<Word>(r_hi) * static_cast<Word>(d_lo);
+
+                const Word uhl = ToUpperHalf(static_cast<HalfWord>(hl));
+                if ((work[0] -= uhl) > (BigNum::MaxWord - uhl)) {
+                    work[1]--;
+                }
+                work[1] -= GetUpperHalf(hl);
+                work[1] -= hh;
+                /* While work still holds at least (div << half a word), remove one more copy and bump the digit. */
+                const Word udl = ToUpperHalf(d_lo);
+                while (work[1] > d_hi || (work[1] == d_hi && work[0] >= udl)) {
+                    if ((work[0] -= udl) > (BigNum::MaxWord - udl)) {
+                        work[1]--;
+                    }
+                    work[1] -= d_hi;
+                    r_hi++;
+                }
+            }
+            /* Initial estimate of the lower quotient digit, taken from the middle bits of the remaining value. */
+            if (d_hi == BigNum::MaxHalfWord) {
+                r_lo = GetLowerHalf(work[1]);
+            } else {
+                r_lo = GetLowerHalf((ToUpperHalf(static_cast<HalfWord>(work[1])) + GetUpperHalf(work[0])) / (d_hi + 1));
+            }
+            /* Subtract (r_lo * div) from work, then raise the digit while work is still at least div. */
+            {
+                const Word ll = static_cast<Word>(r_lo) * static_cast<Word>(d_lo);
+                const Word lh = static_cast<Word>(r_lo) * static_cast<Word>(d_hi);
+
+                if ((work[0] -= ll) > (BigNum::MaxWord - ll)) {
+                    work[1]--;
+                }
+
+                const Word ulh = ToUpperHalf(static_cast<HalfWord>(lh));
+                if ((work[0] -= ulh) > (BigNum::MaxWord - ulh)) {
+                    work[1]--;
+                }
+                work[1] -= GetUpperHalf(lh);
+
+                while ((work[1] > 0) || (work[1] == 0 && work[0] >= div)) {
+                    if ((work[0] -= div) > (BigNum::MaxWord - div)) {
+                        work[1]--;
+                    }
+                    r_lo++;
+                }
+            }
+            /* Combine the two digits into the full-word quotient. */
+            return ToUpperHalf(r_hi) + r_lo;
+        }
+
+    }
+
+    bool BigNum::IsZero(const Word *w, size_t num_words) {
+        /* Any set word makes the value non-zero; a zero-length span counts as zero. */
+        const Word * const end = w + num_words;
+        while (w != end && *w == 0) {
+            ++w;
+        }
+        return w == end;
+    }
+
+    int BigNum::Compare(const Word *lhs, const Word *rhs, size_t num_words) {
+        /* Three-way compare of two equal-length word arrays: 1 if lhs > rhs, -1 if lhs < rhs, 0 if equal. */
+        /* Walk down from the most significant word with a size_t index, so the count is never narrowed to s32. */
+        for (size_t i = num_words; i > 0; i--) {
+            if (lhs[i - 1] != rhs[i - 1]) {
+                return (lhs[i - 1] > rhs[i - 1]) ? 1 : -1;
+            }
+        }
+        return 0;
+    }
+
+    size_t BigNum::CountWords(const Word *w, size_t num_words) {
+        /* Number of words left after dropping zero words from the most significant end (0 for an all-zero value). */
+        /* The count is walked down as a size_t, so it is never narrowed to s32. */
+        while (num_words > 0 && w[num_words - 1] == 0) {
+            num_words--;
+        }
+        return num_words;
+    }
+
+    size_t BigNum::CountSignificantBits(Word w) {
+        size_t bits = 0; /* Position of the highest set bit, counted from one; stays zero when no bit is set. */
+        for (; bits < BitsPerWord && w != 0; w >>= 1) {
+            bits++;
+        }
+        return bits;
+    }
+
+    void BigNum::ClearToZero(Word *w, size_t num_words) {
+        for (Word * const end = w + num_words; w != end; ++w) {
+            *w = 0; /* Every word in [w, w + num_words) becomes zero. */
+        }
+    }
+
+    void BigNum::SetToWord(Word *w, size_t num_words, Word v) {
+        ClearToZero(w, num_words); /* The value becomes v: every word above the lowest one is zero. */
+        if (num_words > 0) { w[0] = v; } /* An empty span has no word to hold v, so nothing is written past it. */
+    }
+
+    void BigNum::Copy(Word *dst, const Word *src, size_t num_words) {
+        for (const Word * const end = src + num_words; src != end; ++src, ++dst) {
+            *dst = *src; /* Word by word in ascending order, exactly num_words words. */
+        }
+    }
+
+    BigNum::Word BigNum::LeftShift(Word *dst, const Word *w, size_t num_words, const size_t shift) {
+        /* Shifts left by shift bits and returns the bits pushed out of the top word. A shift of a whole word or more writes nothing and returns 0. */
+        if (!(shift < BitsPerWord)) {
+            return 0;
+        }
+        /* Walk upward from the least significant word, feeding each word's spilled top bits into the next. */
+        Word spill = 0;
+        for (size_t idx = 0; idx != num_words; ++idx) {
+            const Word in = w[idx];
+            dst[idx] = spill | (in << shift);
+            /* A zero shift spills nothing, and must not perform a full-width right shift. */
+            spill = (shift != 0) ? (in >> (BitsPerWord - shift)) : 0;
+        }
+        return spill;
+    }
+
+    BigNum::Word BigNum::RightShift(Word *dst, const Word *w, size_t num_words, const size_t shift) {
+        /* Shifts right by shift bits and returns the bits pushed out of the lowest word, left-aligned. A shift of a whole word or more writes nothing and returns 0. */
+        if (shift >= BitsPerWord) {
+            return 0;
+        }
+        /* Walk down from the most significant word; a size_t index avoids narrowing num_words to s32. */
+        Word carry = 0;
+        for (size_t i = num_words; i > 0; i--) {
+            const Word cur = w[i - 1];
+            dst[i - 1] = (cur >> shift) | carry;
+            /* A zero shift carries nothing, and must not perform a full-width left shift. */
+            carry = shift ? (cur << (BitsPerWord - shift)) : 0;
+        }
+        return carry;
+    }
+
+    BigNum::Word BigNum::MultSub(Word *dst, const Word *w, const Word *v, size_t num_words, Word mult) {
+        /* Computes dst = w - (v * mult) over num_words words and returns the final borrow word. */
+        if (mult == 0) {
+            /* Nothing is subtracted, but dst must still end up holding w when the two are distinct buffers. */
+            if (dst != w) { Copy(dst, w, num_words); }
+            return 0;
+        }
+        Word borrow = 0, work[2];
+        for (size_t i = 0; i < num_words; i++) {
+            /* Multiply, then take the incoming borrow out of this word; a wrap-around means we borrow again. */
+            MultWord(work, mult, v[i]);
+            if ((dst[i] = (w[i] - borrow)) > (MaxWord - borrow)) {
+                borrow = 1;
+            } else {
+                borrow = 0;
+            }
+            /* Subtract the low product word; the high product word joins the borrow carried to the next word. */
+            if ((dst[i] -= work[0]) > (MaxWord - work[0])) {
+                borrow++;
+            }
+            borrow += work[1];
+        }
+        return borrow;
+    }
+
+    bool BigNum::ExpMod(Word *dst, const Word *src, const Word *exp, size_t exp_words, const Word *mod, size_t mod_words, WordAllocator *allocator) {
+        /* Computes dst = (src ^ exp) mod mod, consuming the exponent two bits at a time; returns false if any step fails (e.g. the allocator runs out of words). */
+        bool needs_exp[4] = {};
+        if (exp_words > 1) {
+            needs_exp[2] = true;
+            needs_exp[3] = true;
+        } else {
+            Word exp_w = (exp_words > 0) ? exp[0] : 0; /* An empty exponent has no word to read. */
+            /* Record which two-bit digit values occur in the single exponent word. */
+            for (size_t i = 0; i < BitsPerWord / 2; i++) {
+                /* Nintendo at each step determines needed exponent from a pair of two bits. */
+                needs_exp[exp_w & 0x3u] = true;
+                exp_w >>= 2;
+            }
+
+            if (needs_exp[3]) {
+                needs_exp[2] = true; /* The third power is built from the second. */
+            }
+        }
+
+        /* Allocate space for powers 1, 2, 3. */
+        auto power_1 = allocator->Allocate(mod_words);
+        auto power_2 = allocator->Allocate(mod_words);
+        auto power_3 = allocator->Allocate(mod_words);
+        if (!(power_1.IsValid() && power_2.IsValid() && power_3.IsValid())) {
+            return false;
+        }
+        decltype(power_1)* powers[3] = { &power_1, &power_2, &power_3 };
+
+        /* Set the powers of src. */
+        Copy(power_1.GetBuffer(), src, mod_words);
+        if (needs_exp[2]) {
+            if (!MultMod(power_2.GetBuffer(), power_1.GetBuffer(), src, mod, mod_words, allocator)) {
+                return false;
+            }
+        }
+        if (needs_exp[3]) {
+            if (!MultMod(power_3.GetBuffer(), power_2.GetBuffer(), src, mod, mod_words, allocator)) {
+                return false;
+            }
+        }
+
+        /* Allocate space to work; the running result starts at one. */
+        auto work = allocator->Allocate(mod_words);
+        if (!work.IsValid()) {
+            return false;
+        }
+        SetToWord(work.GetBuffer(), work.GetCount(), 1);
+
+        /* Ensure we're working with the correct exponent word count. */
+        exp_words = CountWords(exp, exp_words);
+
+        for (s32 i = static_cast<s32>(exp_words - 1); i >= 0; i--) {
+            Word cur_word = exp[i];
+            size_t cur_bits = BitsPerWord;
+
+            /* Remove leading zeroes in first word. */
+            if (i == static_cast<s32>(exp_words - 1)) {
+                while (!GetTop2Bits(cur_word)) {
+                    cur_word <<= 2;
+                    cur_bits -= 2;
+                }
+            }
+
+            /* Compute current modular multiplicative step. */
+            for (size_t j = 0; j < cur_bits; j += 2, cur_word <<= 2) {
+                /* Exponentiate current work to the 4th power. */
+                if (!MultMod(work.GetBuffer(), work.GetBuffer(), work.GetBuffer(), mod, mod_words, allocator)) {
+                    return false;
+                }
+
+                if (!MultMod(work.GetBuffer(), work.GetBuffer(), work.GetBuffer(), mod, mod_words, allocator)) {
+                    return false;
+                }
+
+                if (const Word top = GetTop2Bits(cur_word)) {
+                    if (!MultMod(work.GetBuffer(), work.GetBuffer(), powers[top - 1]->GetBuffer(), mod, mod_words, allocator)) {
+                        return false;
+                    }
+                }
+            }
+        }
+
+        /* Copy work to output. */
+        Copy(dst, work.GetBuffer(), mod_words);
+
+        return true;
+    }
+
+    bool BigNum::MultMod(Word *dst, const Word *src, const Word *mult, const Word *mod, size_t num_words, WordAllocator *allocator) {
+        /* Computes dst = (src * mult) mod `mod`; returns false if the allocation, Mult, or Mod fails. */
+        /* The full product of two num_words-wide operands needs twice as many words. */
+        const size_t product_words = 2 * num_words;
+        auto product = allocator->Allocate(product_words);
+        if (!product.IsValid()) {
+            return false;
+        }
+
+        /* Form the double-width product in the scratch buffer. */
+        const bool multiplied = Mult(product.GetBuffer(), src, mult, num_words, allocator);
+        if (!multiplied) {
+            return false;
+        }
+
+        /* Reduce the product by mod straight into dst; that step's status is our status. */
+        const bool reduced = Mod(dst, product.GetBuffer(), product_words, mod, num_words, allocator);
+        return reduced;
+    }
+
+    bool BigNum::Mod(Word *dst, const Word *src, size_t src_words, const Word *mod, size_t mod_words, WordAllocator *allocator) {
+        /* Computes dst = src mod `mod` by running DivMod and keeping only the remainder. */
+        /* Returns false when the scratch allocation fails or DivMod reports failure. */
+
+        /* DivMod also produces a quotient, so hand it a throwaway src_words-sized buffer. */
+        auto discarded_quot = allocator->Allocate(src_words);
+        if (!discarded_quot.IsValid()) {
+            return false;
+        }
+
+        /* dst receives the remainder; DivMod's status is passed through unchanged. */
+        return DivMod(discarded_quot.GetBuffer(), dst, src, src_words, mod, mod_words, allocator);
+    }
+
+    bool BigNum::DivMod(Word *quot, Word *rem, const Word *top, size_t top_words, const Word *bot, size_t bot_words, WordAllocator *allocator) {
+        /* Long division: writes top / bot to quot (top_words words) and the remainder to rem (bot_words words). Returns false on allocation failure or when bot is zero. Allocate work copies of both operands. */
+        auto top_work = allocator->Allocate(top_words + 1); /* One extra word receives LeftShift's return value below (presumably the bits shifted out of top). */
+        auto bot_work = allocator->Allocate(bot_words);
+        if (!(top_work.IsValid() && bot_work.IsValid())) {
+            return false;
+        }
+
+        /* Prevent division by zero: a divisor for which CountWords reports no words in use is rejected. */
+        size_t bot_work_words = CountWords(bot, bot_words);
+        if (bot_work_words == 0) {
+            return false;
+        }
+
+        ClearToZero(quot, top_words);
+        ClearToZero(top_work.GetBuffer(), bot_work_words);
+
+        /* Align to edges: shift both operands left by the same amount so the divisor's highest in-use word is left-justified (assuming CountSignificantBits returns the bit length). */
+        const size_t shift = BitsPerWord - CountSignificantBits(bot[bot_work_words - 1]);
+        top_work.GetBuffer()[top_words] = LeftShift(top_work.GetBuffer(), top, top_words, shift);
+        LeftShift(bot_work.GetBuffer(), bot, bot_work_words, shift);
+        const Word tb = bot_work.GetBuffer()[bot_work_words - 1]; /* Highest word of the shifted divisor; drives the quotient estimate. */
+
+        /* Repeatedly div + sub: for each quotient word from the highest down, estimate it, subtract estimate * divisor, then correct the estimate upward. */
+        for (s32 i = (top_words - bot_work_words); i >= 0; i--) { /* NOTE(review): appears to assume top_words >= bot_work_words; confirm against callers. */
+            Word cur_word;
+            if (tb == MaxWord) { /* tb + 1 would overflow here, so take the high work word itself as the estimate. */
+                cur_word = top_work.GetBuffer()[i + bot_work_words];
+            } else { /* Estimate by dividing by tb + 1 (presumably so the estimate never overshoots; the loop below only corrects upward). */
+                cur_word = DivWord(top_work.GetBuffer() + i + bot_work_words - 1, tb + 1);
+            }
+            top_work.GetBuffer()[i + bot_work_words] -= MultSub(top_work.GetBuffer() + i, top_work.GetBuffer() + i, bot_work.GetBuffer(), bot_work_words, cur_word);
+
+            while (top_work.GetBuffer()[i + bot_work_words] || Compare(top_work.GetBuffer() + i, bot_work.GetBuffer(), bot_work_words) >= 0) {
+                cur_word++; /* Estimate was too low: bump it and subtract the divisor once more. */
+                top_work.GetBuffer()[i + bot_work_words] -= Sub(top_work.GetBuffer() + i, top_work.GetBuffer() + i, bot_work.GetBuffer(), bot_work_words);
+            }
+            quot[i] = cur_word;
+        }
+
+        /* Calculate remainder: what is left in the low bot_work_words words of top_work, shifted back right to undo the alignment. */
+        ClearToZero(rem, bot_words);
+        RightShift(rem, top_work.GetBuffer(), bot_work_words, shift);
+
+        return true;
+    }
+
+    bool BigNum::Mult(Word *dst, const Word *lhs, const Word *rhs, size_t num_words, WordAllocator *allocator) {
+        /* Computes dst = lhs * rhs through a zeroed scratch buffer; fails only if that buffer cannot be allocated. */
+        auto product = allocator->Allocate(2 * num_words);
+        if (!product.IsValid()) {
+            return false;
+        }
+        ClearToZero(product.GetBuffer(), product.GetCount());
+
+        /* Only the words CountWords reports for each operand take part in the product. */
+        const size_t lhs_used = CountWords(lhs, num_words);
+        const size_t rhs_used = CountWords(rhs, num_words);
+        /* Row by row: MultAdd adds rhs * lhs[row] into the buffer at word offset row; its return value goes into the next word up. */
+        for (size_t row = 0; row != lhs_used; ++row) {
+            const auto carry = MultAdd(product.GetBuffer() + row, rhs, rhs_used, lhs[row]);
+            product.GetBuffer()[row + rhs_used] += carry;
+        }
+
+        /* Hand every word of the scratch buffer to the caller. */
+        Copy(dst, product.GetBuffer(), product.GetCount());
+        return true;
+    }
+
+}
\ No newline at end of file
diff --git a/libraries/libvapours/source/crypto/impl/crypto_bignum_operations_asm.arch.arm64.s b/libraries/libvapours/source/crypto/impl/crypto_bignum_operations_asm.arch.arm64.s
new file mode 100644
index 000000000..d7c7ae52b
--- /dev/null
+++ b/libraries/libvapours/source/crypto/impl/crypto_bignum_operations_asm.arch.arm64.s
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2018-2020 Atmosphère-NX
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* ams::crypto::impl::BigNum::Add(Word *dst, const Word *lhs, const Word *rhs, size_t num_words): dst = lhs + rhs over num_words 32-bit words; returns the final carry (0 or 1). */
+.section    .text._ZN3ams6crypto4impl6BigNum3AddEPjPKjS5_m, "ax", %progbits
+.global     _ZN3ams6crypto4impl6BigNum3AddEPjPKjS5_m
+.type       _ZN3ams6crypto4impl6BigNum3AddEPjPKjS5_m, %function
+.balign 0x10
+_ZN3ams6crypto4impl6BigNum3AddEPjPKjS5_m:
+    /* Clear NZCV so the carry chain starts at zero, then check if we have anything to do at all. */
+    msr     nzcv, xzr
+    cbz     x3, 7f
+
+    /* Save the registers used as scratch below (the xzr slot only pads the x19 store to a 16-byte pair). */
+    stp     x16, x17, [sp, #-16]!
+    stp     xzr, x19, [sp, #-16]!
+    stp     x20, x21, [sp, #-16]!
+
+    /* x20 = number of full 16-word chunks; check if we have less than 16 words to process. */
+    lsr     x20, x3, #4
+    cbz     x20, 2f
+
+    sub     x3, x3, x20, lsl #4 /* x3 = words left over after the 16-word chunks. */
+
+1:  /* Process 16 words (eight 64-bit adds) at a time; only the adcs instructions touch the flags, so the carry survives each iteration. */
+    /* NOTE: Nintendo uses X18 here, we will use X21 for EL1+ compat. */
+    ldp      x4,  x5, [x1], #16
+    ldp     x12, x13, [x2], #16
+    ldp      x6,  x7, [x1], #16
+    ldp     x14, x15, [x2], #16
+    ldp      x8,  x9, [x1], #16
+    ldp     x16, x17, [x2], #16
+    ldp     x10, x11, [x1], #16
+    ldp     x21, x19, [x2], #16
+
+    adcs    x4, x4, x12
+    adcs    x5, x5, x13
+    stp     x4, x5, [x0], #16
+
+    adcs    x6, x6, x14
+    adcs    x7, x7, x15
+    stp     x6, x7, [x0], #16
+
+    adcs    x8, x8, x16
+    adcs    x9, x9, x17
+    stp     x8, x9, [x0], #16
+
+    adcs    x10, x10, x21
+    adcs    x11, x11, x19
+    stp     x10, x11, [x0], #16
+
+    sub     x20, x20, #1
+    cbnz    x20, 1b
+
+2:  /* We have less than 16 words to process; x15 = number of full 4-word chunks. */
+    lsr     x15, x3, #2
+    cbz     x15, 4f
+
+    sub     x3, x3, x15, lsl #2 /* x3 = words left over after the 4-word chunks. */
+
+3:  /* Process 4 words (two 64-bit adds) at a time. */
+    ldp     x4, x5, [x1], #16
+    ldp     x8, x9, [x2], #16
+
+    sub     x15, x15, #1
+
+    adcs    x4, x4, x8
+    adcs    x5, x5, x9
+
+    stp     x4, x5, [x0], #16
+
+    cbnz    x15, 3b
+
+4:  /* We have less than 4 words to process. */
+    cbz     x3, 6f
+
+5:  /* Process 1 word at a time using 32-bit adds. */
+    ldr     w4, [x1], #4
+    ldr     w8, [x2], #4
+    adcs    w4, w4, w8
+    str     w4, [x0], #4
+
+    sub     x3, x3, #1
+    cbnz    x3, 5b
+
+6:  /* Restore registers we used while adding. */
+    ldp     x20, x21, [sp], #16
+    ldp     xzr, x19, [sp], #16
+    ldp     x16, x17, [sp], #16
+
+7:  /* We're done: return the carry flag (0 or 1) in x0. */
+    adc     x0, xzr, xzr
+    ret
+
+/* ams::crypto::impl::BigNum::Sub(Word *dst, const Word *lhs, const Word *rhs, size_t num_words): dst = lhs - rhs over num_words 32-bit words; returns the final borrow (0 or 1). */
+.section    .text._ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m, "ax", %progbits
+.global     _ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m
+.type       _ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m, %function
+.balign 0x10
+_ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m:
+    /* Set only the C flag (bit 29 of NZCV) so the sbcs chain starts with no borrow, then check if we have anything to do at all. */
+    mov     x4, #0x20000000
+    msr     nzcv, x4
+    cbz     x3, 7f
+
+    /* Save the registers used as scratch below (the xzr slot only pads the x19 store to a 16-byte pair). */
+    stp     x16, x17, [sp, #-16]!
+    stp     xzr, x19, [sp, #-16]!
+    stp     x20, x21, [sp, #-16]!
+
+    /* x20 = number of full 16-word chunks; check if we have less than 16 words to process. */
+    lsr     x20, x3, #4
+    cbz     x20, 2f
+
+    sub     x3, x3, x20, lsl #4 /* x3 = words left over after the 16-word chunks. */
+
+1:  /* Process 16 words (eight 64-bit subtracts) at a time; only the sbcs instructions touch the flags, so the borrow survives each iteration. */
+    /* NOTE: Nintendo uses X18 here, we will use X21 for EL1+ compat. */
+    ldp      x4,  x5, [x1], #16
+    ldp     x12, x13, [x2], #16
+    ldp      x6,  x7, [x1], #16
+    ldp     x14, x15, [x2], #16
+    ldp      x8,  x9, [x1], #16
+    ldp     x16, x17, [x2], #16
+    ldp     x10, x11, [x1], #16
+    ldp     x21, x19, [x2], #16
+
+    sbcs    x4, x4, x12
+    sbcs    x5, x5, x13
+    stp     x4, x5, [x0], #16
+
+    sbcs    x6, x6, x14
+    sbcs    x7, x7, x15
+    stp     x6, x7, [x0], #16
+
+    sbcs    x8, x8, x16
+    sbcs    x9, x9, x17
+    stp     x8, x9, [x0], #16
+
+    sbcs    x10, x10, x21
+    sbcs    x11, x11, x19
+    stp     x10, x11, [x0], #16
+
+    sub     x20, x20, #1
+    cbnz    x20, 1b
+
+2:  /* We have less than 16 words to process; x15 = number of full 4-word chunks. */
+    lsr     x15, x3, #2
+    cbz     x15, 4f
+
+    sub     x3, x3, x15, lsl #2 /* x3 = words left over after the 4-word chunks. */
+
+3:  /* Process 4 words (two 64-bit subtracts) at a time. */
+    ldp     x4, x5, [x1], #16
+    ldp     x8, x9, [x2], #16
+
+    sub     x15, x15, #1
+
+    sbcs    x4, x4, x8
+    sbcs    x5, x5, x9
+
+    stp     x4, x5, [x0], #16
+
+    cbnz    x15, 3b
+
+4:  /* We have less than 4 words to process. */
+    cbz     x3, 6f
+
+5:  /* Process 1 word at a time using 32-bit subtracts. */
+    ldr     w4, [x1], #4
+    ldr     w8, [x2], #4
+    sbcs    w4, w4, w8
+    str     w4, [x0], #4
+
+    sub     x3, x3, #1
+    cbnz    x3, 5b
+
+6:  /* Restore registers we used while subtracting. */
+    ldp     x20, x21, [sp], #16
+    ldp     xzr, x19, [sp], #16
+    ldp     x16, x17, [sp], #16
+
+7:  /* We're done: return 1 if the carry flag ended up clear (a borrow occurred), else 0. */
+    cinc    x0, xzr, cc
+    ret
+
+/* ams::crypto::impl::BigNum::MultAdd(Word *dst, const Word *w, size_t num_words, Word mult): dst[i] += w[i] * mult over num_words 32-bit words; returns the final carry word. */
+.section    .text._ZN3ams6crypto4impl6BigNum7MultAddEPjPKjmj, "ax", %progbits
+.global     _ZN3ams6crypto4impl6BigNum7MultAddEPjPKjmj
+.type       _ZN3ams6crypto4impl6BigNum7MultAddEPjPKjmj, %function
+.balign 0x10
+_ZN3ams6crypto4impl6BigNum7MultAddEPjPKjmj:
+    /* x15 holds the running carry in its upper 32 bits; start it at zero, then check if we have anything to do at all. */
+    mov     x15, xzr
+    cbz     x2, 5f
+
+    /* Check if we have less than four words to process. */
+    lsr     x6, x2, #2
+    cbz     x6, 2f
+
+    /* We have at least four words to process; x16 is used as scratch in the loop, so save it (together with x17). */
+    sub     x2, x2, x6, lsl #2
+    stp     x16, x17, [sp, #-16]!
+
+1:  /* Loop processing four words at a time: load four words of w and the four dst words they accumulate into. */
+    ldp     w4, w5, [x1], #8
+    ldp     w16, w7, [x1], #8
+    ldp     w8, w9, [x0]
+    ldp     w10, w11, [x0, #8]
+
+    umaddl  x4,  w3, w4,  x8 /* Each umaddl forms the 64-bit value mult * w[i] + dst[i]. */
+    umaddl  x5,  w3, w5,  x9
+    umaddl  x16, w3, w16, x10
+    umaddl  x7,  w3, w7,  x11
+
+    add     x12, x4, x15, lsr #32 /* Add the carry: the upper 32 bits of the previous sum. */
+    add     x13, x5, x12, lsr #32
+    stp     w12, w13, [x0], #8 /* Store only the low 32 bits of each sum. */
+
+    add     x14, x16, x13, lsr #32
+    add     x15, x7, x14, lsr #32
+    stp     w14, w15, [x0], #8
+
+    sub     x6, x6, #1
+    cbnz    x6, 1b
+
+    ldp     x16, x17, [sp], #16
+
+2:  /* We have less than four words. Check if we have less than two. */
+    lsr     x6, x2, #1
+    cbz     x6, 4f
+
+    /* We have at least two words to process. */
+    sub     x2, x2, x6, lsl #1
+
+3:  /* Loop processing two words at a time, with the same multiply-accumulate and carry scheme as above. */
+    ldp     w4, w5, [x1], #8
+    ldp     w8, w9, [x0]
+
+    umaddl  x4, w3, w4, x8
+    umaddl  x5, w3, w5, x9
+
+    sub     x6, x6, #1
+
+    add     x14, x4, x15, lsr #32
+    add     x15, x5, x14, lsr #32
+
+    stp     w14, w15, [x0], #8
+
+    cbnz    x6, 3b
+
+4:  /* We have less than two words to process. */
+    cbz     x2, 5f
+
+    /* We have one word to process. */
+    ldr     w4, [x1], #4
+    ldr     w8, [x0]
+
+    umaddl  x4, w3, w4, x8
+    add     x15, x4, x15, lsr #32
+
+    str     w15, [x0], #4
+
+5:  /* We're done: return the final carry, i.e. the upper 32 bits of the last sum (zero if nothing was processed). */
+    lsr     x0, x15, #32
+    ret