# toki-pona-rhymes/gen_rhymes.py

#!/usr/bin/env python3

import sys as trans
from pprint import pp

# Toki pona spelling maps directly onto pronunciation, so rhymes can be
# detected purely by comparing the written endings of words.
VOWELS = list("aeiou")

# Accumulators filled in by the script below.
rhymes = {}         # word -> list of the words it rhymes with
rhyme_classes = {}  # shared vowel-initial ending -> words ending with it
words = []          # every dictionary entry, in file order

def matching_substring_r(str1, str2):
    """Return the longest common suffix of ``str1`` and ``str2``.

    The two strings are walked backwards in lockstep, starting from their
    last characters, until the characters differ or the shorter string is
    exhausted. The result is the run of identical trailing characters, or
    an empty string when either input is empty or the last characters differ.
    """
    shared = 0
    for left, right in zip(reversed(str1), reversed(str2)):
        if left != right:
            break
        shared += 1

    # Guard the slice: str1[-0:] would be the whole string, not "".
    return str1[-shared:] if shared else ""

# Need a dict file to work from. The usage message goes to stderr so that it
# is not mixed into the rhyme listing when stdout is redirected or piped.
if len(trans.argv) != 2:
    print("Please provide a single dictionary file to work from.",
          file=trans.stderr)
    trans.exit(1)

# Read words, one per line. Surrounding whitespace is stripped, and blank or
# repeated lines are skipped: an empty entry would otherwise be reported as an
# unrhymed word, and a repeated word would be listed twice among the rhymes of
# each of its partners.
seen_words = set()
with open(trans.argv[1]) as dictionary:
    for line in dictionary:
        word = line.strip()
        if not word or word in seen_words:
            continue
        seen_words.add(word)
        words.append(word)

print(f"Read {len(words)} words.")

# Compare every word against every other word. Each pair is visited twice
# (once in each direction), so every iteration only records information for
# the outer-loop word.
for word in words:
    partners = []
    rhymes[word] = partners

    for candidate in words:
        # a word is not considered a rhyme of itself
        if candidate == word:
            continue

        # The rhyme class of a pair is their shared ending, trimmed so that
        # it begins at its first vowel.
        ending = matching_substring_r(word, candidate)
        first_vowel = next(
            (pos for pos, letter in enumerate(ending) if letter in VOWELS),
            None,
        )

        # A shared ending with no vowel, or one consisting only of a final
        # vowel (lili / monsi), is too weak to count as a rhyme.
        if first_vowel is None or len(ending) - first_vowel < 2:
            continue
        rhyme_class = ending[first_vowel:]

        partners.append(candidate)

        # Register only the outer word in the class; the candidate gets
        # added when it takes its own turn in the outer loop.
        members = rhyme_classes.setdefault(rhyme_class, [])
        if word not in members:
            members.append(word)

# Alphabetised views of the results for stable, readable output.
rhyme_classes_list = sorted(rhyme_classes)
rhymes_list = sorted(rhymes)
unrhymed_words = sorted(word for word in rhymes if not rhymes[word])

print(f"Discovered {len(rhyme_classes_list)} rhyme classes.")
print("Rhyme classes: " + ", ".join(rhyme_classes_list))

# One line per word that has at least one rhyme; words without any are
# collected in the final summary line instead.
print("\nRhymes")
for word in rhymes_list:
    rhymes_for_word_list = rhymes[word]
    rhymes_for_word_list.sort()  # sorts the stored list in place
    if rhymes_for_word_list:
        print(f"{word}: {', '.join(rhymes_for_word_list)}")

print("\nUnrhymed words: " + ", ".join(unrhymed_words))