Arbitrary Sort Orders in Python

I wrote this little tidbit of code in response to the constant sorting irritations I kept running into with the other Python support code I use for conlanging. The full story is on my blog.

import re

class ArbSorter:
    """Sort words according to an arbitrary, user-supplied collation order.

    The order is given as a whitespace-separated string of sort elements,
    listed from first to last.  An element may be longer than one
    character (e.g. "ch" sorting after every "c"), in which case the
    longest matching element wins when a word is broken up.

    Attributes:
        splitter: compiled regex matching any single sort element.
        ords: dict mapping each sort element to its weight (its position
            in the order string; if an element is repeated, the last
            position wins).
        vals: list of sort elements indexed by weight.

    Calling an instance with a collection of words returns a new list of
    those words sorted by the custom order.
    """

    def __init__(self, order):
        """Build the tokenizer and the weight tables from *order*.

        order: whitespace-separated sort elements, first-sorting first.
        """
        # str.split() with no argument splits on runs of whitespace and
        # never yields empty strings.  (Splitting with the regex \s*
        # depends on empty matches being ignored, which is no longer the
        # case on Python 3.7+.)
        elts = order.split()
        # Try longer elements first so that multicharacter sort keys
        # ("ch") are preferred over their prefixes ("c").  Each element
        # is escaped so regex metacharacters are matched literally.
        split_order = sorted(elts, key=len, reverse=True)
        if split_order:
            pattern = u"(%s)" % u"|".join(re.escape(e) for e in split_order)
        else:
            # No elements at all: use a pattern that can never match
            # rather than one that matches the empty string everywhere.
            pattern = u"(?!)"
        self.splitter = re.compile(pattern, re.UNICODE)
        # Next, collect weights for the ordering.
        self.ords = {}
        self.vals = []
        for weight, elt in enumerate(elts):
            self.ords[elt] = weight
            self.vals.append(elt)

    def word_as_values(self, word):
        """Return *word* as a list of int weights, one per sort element.

        Lists compare lexicographically, so the result can be used
        directly as a sort key.  Text in *word* that matches no sort
        element is skipped and contributes nothing to the result.
        """
        return [self.ords[elt] for elt in self.splitter.findall(word)]

    def values_as_word(self, values):
        """Return the string spelled by a list of weights.

        This is the inverse of word_as_values() for words made up
        entirely of known sort elements.
        """
        return "".join(self.vals[v] for v in values)

    def __call__(self, l):
        """Return a new list with the words of *l* in the custom order.

        The original words are returned unchanged: characters that are
        not part of the ordering are ignored when comparing but are kept
        in the output.  Words that compare equal keep their input order.
        """
        return sorted(l, key=self.word_as_values)

if __name__ == '__main__':
    # Small demonstration: sort a handful of words using an ordering in
    # which "á" follows "a", "ch" follows "c" and "ng" follows "n".
    import sys

    mysorter = ArbSorter(u"a á c ch e h i k l m n ng o p r s t u")
    m = u"chica ciha no áru ngo na nga sangal ahi ná mochi moco"
    s = mysorter(m.split())
    # Write UTF-8 bytes explicitly instead of using the Python 2-only
    # print statement.  On Python 3 the bytes go to the underlying
    # binary stream (sys.stdout.buffer); on Python 2 sys.stdout accepts
    # byte strings directly.
    line = (u" ".join(s) + u"\n").encode('utf-8')
    getattr(sys.stdout, 'buffer', sys.stdout).write(line)

Copyright (c) 2006-2017 William S. Annis