Arbitrary Sort Orders in Python
I wrote this little tidbit of code in response to the constant sorting irritations I kept running into with the other Python support code I use for conlanging. The full story is on my blog.
import re


class ArbSorter:
    """Sort words according to an arbitrary, user-supplied alphabet.

    The alphabet is given as a whitespace-separated string of sort keys in
    the desired order.  A key may be longer than one character (for example
    "ch" sorting after every "c"), in which case the longest matching key
    wins when a word is broken into keys.

    Attributes:
        splitter: compiled regex with one capturing group that matches any
            single sort key, longest keys first.
        ords: dict mapping each sort key to its integer weight (its
            position in the ordering; if a key is repeated, the last
            position wins).
        vals: list mapping a weight back to its sort key.
    """

    def __init__(self, order):
        """Build the tokenizer and weight tables from *order*.

        Args:
            order: whitespace-separated sort keys, lowest-sorting first.

        Raises:
            ValueError: if *order* contains no sort keys at all.
        """
        # str.split() splits on runs of whitespace and never yields empty
        # strings.  A regex split on '\s*' can match the empty string,
        # which on Python 3.7+ splits between every character and so
        # destroys multicharacter keys.
        elts = order.split()
        if not elts:
            raise ValueError("order must contain at least one sort key")

        # Create a regex to split on each character or multicharacter
        # sort key.  (As in "ch" after all "c"s, for example.)  Longest
        # keys go first so that "ch" is tried before "c".  Keys are escaped
        # so that regex metacharacters in the alphabet are taken literally.
        split_order = sorted(elts, key=len, reverse=True)
        self.splitter = re.compile(
            "(%s)" % "|".join(re.escape(key) for key in split_order),
            re.UNICODE,
        )

        # Next, collect weights for the ordering.
        self.ords = {key: weight for weight, key in enumerate(elts)}
        self.vals = list(elts)

    def word_as_values(self, word):
        """Turn *word* into a list of ints giving its new lexicographic order.

        Python compares lists element by element, so these lists can be
        used directly as sort keys.  Text in *word* that matches no sort
        key is skipped and contributes nothing to the result.
        """
        # Splitting on a pattern with one capturing group returns
        # [text, key, text, key, ..., text]; the odd indices are the keys.
        keys = self.splitter.split(word)[1::2]
        return [self.ords[key] for key in keys]

    def values_as_word(self, values):
        """Inverse of word_as_values: join the keys for *values* into a string."""
        return "".join([self.vals[v] for v in values])

    def __call__(self, l):
        """Return the items of *l* as a new list sorted by the custom order.

        The original strings are returned unchanged; they are only
        converted to weight lists for comparison.  Items whose weight
        lists are equal keep their input order (the sort is stable).
        """
        # Sorting the original items by key, rather than converting each
        # word to weights and back, avoids dropping any characters that
        # are not part of the ordering.
        return sorted(l, key=self.word_as_values)


if __name__ == '__main__':
    mysorter = ArbSorter(u"a á c ch e h i k l m n ng o p r s t u")
    m = u"chica ciha no áru ngo na nga sangal ahi ná mochi moco"
    s = mysorter(m.split())
    # print() as a function works on Python 3; the str is encoded by stdout.
    print(" ".join(s))