/*
 * 
 * coded.c
 *
 * Breaks simple substitution ciphers using dictionary search.
 * Very fast!
 * Not very helpful commenting, though.
 *
 * Copyright (C)2005 Roger Nesbitt.
 * http://moge.st/
 * Free for non-commercial use.
 * 
 */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define DICTIONARY_FILE "/usr/dict/words"

struct dict_entry
{
	char word;
	struct dict_entry *children[128];
};

struct linked_list
{
	char *data;
	struct linked_list *next;
};
	

// This function loads the system dictionary into a tree, one character per node.
int load_dictionary(struct dict_entry *dictionary)
{
	FILE *f;
	char buf[4096];
	struct dict_entry *p;
	int i, count, skip;;

	f = fopen(DICTIONARY_FILE, "r");
	if (!f) return -1;

	count = 0;
	while (fgets(buf, 4096, f))
	{
		p = dictionary;
		skip = 0;
		for (i=0; i<strlen(buf); i++)
		{
			if (buf[i] == '\r' || buf[i] == '\n') break;
			if ((unsigned char)buf[i] > 127)
			{
				skip = 1;
				continue;
			}

			if (p->children[(int)buf[i]] == NULL)
			{
				p->children[(int)buf[i]] = (struct dict_entry *)malloc(sizeof(struct dict_entry));
				p = p->children[(int)buf[i]];
				if (p == NULL) return -1;
				memset(p, 0, sizeof(struct dict_entry));
			}
			else
			{
				p = p->children[(int)buf[i]];
			}
		}
		
		if (!skip)
		{
			p->word = 1;
			count++;
		}
	}

	fclose(f);
	return count;
}

// This function splits a string into separate words.
int split(char *s, char ***wordsp)
{
	int i, len, count = 2;
	char *start, *p, **words;

	for (i=0; i<strlen(s); i++) if (s[i] == ' ') count++;
	words = *wordsp = (char **)malloc(sizeof(char **) * count);
	if (words == NULL) return -1;
	
	count = 0;
	start = p = s;
	while (1)
	{
		if (*p == ' ' || *p == 0)
		{
			len = (*p == 0) ? p - start : p - start;
			words[count] = (char *)malloc(len + 1);
			if (words[count] == NULL) return -1;
			strncpy(words[count], start, len);
			words[count][len] = 0;
			count++;
			start = p + 1;
		}

		if (!*p) break;
		p++;
	}
	
	words[count] = NULL;
	return 0;
}

int find(char **words, struct dict_entry *dictionary, char *pforward, char *preverse, struct linked_list **presult)
{
	char *word = words[0];
	struct dict_entry **dp, **dpp;
	unsigned char *testing, *tpp, *found, *fpp;
	char *p;
	int i, die = 0;
	char *forward, *reverse;
	struct linked_list *result, *cresult, *ll;

	// result holds the matching words.
	result = *presult = NULL;

	// forward and reverse keep track of the character translations between the original word and the
	// matching dictionary words.  We make a copy of them here so we can pass them recursively back
	// to ourselves for checking subsequent words.
	forward = (char *)malloc(128);
	reverse = (char *)malloc(128);
	if (forward == NULL || reverse == NULL)
	{
		fprintf(stderr, "malloc() out of memory on forward/reverse allocate.\n");
		return -1;
	}
	memcpy(forward, pforward, 128);
	memcpy(reverse, preverse, 128);

	// dp is a list of pointers to directory entries.  The first letter (dp[0]) points to the
	// directory root, the second letter (dp[1]) points to the tree node that corresponds to the
	// first letter in, and so forth.  This list is used so we can backtrack when we run into
	// a word that doesn't exist.
	dp = (struct dict_entry **)malloc(sizeof(struct dict_entry *) * strlen(word) + 2);
	if (dp == NULL)
	{
		fprintf(stderr, "malloc() out of memory on dp allocate.\n");
		return -1;
	}
	
	// found stores the current string we're testing to see if it matches the word.
	found = (char *)malloc(sizeof(char) * strlen(word) + 2);
	if (found == NULL)
	{
		fprintf(stderr, "malloc() out of memory on testing allocate.\n");
		return -1;
	}
	memset(found, 0, strlen(word) + 2);

	// testing is much like found, but we store the value 128 in characters that cannot
	// be further tested (because we've tested all of them already, or because a forward
	// lookup was used to match.)
	testing = (char *)malloc(sizeof(char) * strlen(word) + 2);
	if (testing == NULL)
	{
		fprintf(stderr, "malloc() out of memory on testing allocate.\n");
		return -1;
	}
	memset(testing, 0, strlen(word) + 2);


	dp[0] = dictionary;
	testing[0] = 32;
	dpp = dp;
	fpp = found;
	tpp = testing;
	
	//printf("Matching word is %s.\n", word);
	p = word;
	while (*p && !die)
	{
		//printf("Starting test at '%s'\n", testing);
		
		// Has this letter already been used before?  If so, forward[] will be set with the character
		// and we MUST match the character previously chosen.
		if (*tpp != 128 && forward[(int)*p])
		{
			if ((*dpp)->children[(int)forward[(int)*p]] == NULL) 
			{
				*tpp = 128;
				//printf("dictionary dead end, path %s%c doesn't exist\n", found, forward[(int)*p]);
			}
			else
			{
				*tpp = 128; // no matter what, because we've finished searching on this character
				*fpp = forward[(int)*p];
				dpp[1] = (*dpp)->children[(int)*fpp];
				dpp++;
				tpp++;
				fpp++;
				*tpp = 32;
				p++;
			}
		}
		else // we haven't seen this character before.  Scan all the letters to find one that may match.
		{
			for (; *tpp<128; (*tpp)++)
			{
				if (reverse[(int)*tpp]) continue;
				if ((*dpp)->children[(int)*tpp] == NULL) continue;
				if (p[1] == 0 && (*dpp)->children[*tpp]->word == 0) continue;
				forward[(int)*p] = *tpp;
				reverse[(int)*tpp] = *p;
				dpp[1] = (*dpp)->children[*tpp];
				dpp++;
				*fpp = *tpp;
				fpp++;
				tpp++;
				*tpp = 32;
				p++;
				break;
			}
		}
		
		if (*tpp == 128 || *p == 0)
		{
			// Check to see if we've reached the end of the matching string and we've found a dictionary word.
			if (*p == 0 && (*dpp)->word == 1)
			{
				*fpp = 0;
				if (words[1] != NULL)
				{
					// Recurse to find the next words with our forward and reverse set.
					if (find(&words[1], dictionary, forward, reverse, &cresult) == -1) return -1;
					while (cresult)
					{
						if (result == NULL)
							result = *presult = (struct linked_list *)malloc(sizeof(struct linked_list));
						else
						{
							result->next = (struct linked_list *)malloc(sizeof(struct linked_list));
							result = result->next;
						}
						
						if (result == NULL)
						{
							fprintf(stderr, "Out of memory building linked list.\n");
							return -1;
						}
							
						result->data = (char *)malloc(strlen(found) + strlen(cresult->data) + 1 + 1);
						if (result->data == NULL)
						{
							fprintf(stderr, "Out of memory allocating result string\n");
							return -1;
						}
						sprintf(result->data, "%s %s", found, cresult->data);
						result->next = NULL;
						ll = cresult;
						cresult = cresult->next;
						free(ll);
					}						
				}
				else
				{
					if (result == NULL)
						result = *presult = (struct linked_list *)malloc(sizeof(struct linked_list));
					else
					{
						result->next = (struct linked_list *)malloc(sizeof(struct linked_list));
						result = result->next;
					}
					
					if (result == NULL)
					{
						fprintf(stderr, "Out of memory building linked list.\n");
						return -1;
					}
						
					result->data = (char *)malloc(strlen(found) + 1);
					if (result->data == NULL)
					{
						fprintf(stderr, "Out of memory allocating result string\n");
						return -1;
					}
					strcpy(result->data, found);
					result->next = NULL;
				}

				//printf("Word found is %s.\n", found);
			}
				
			// Check to see if we've reached the beginning of the word: if so, we haven't found a match.
			if (p == word)
			{
				die = 1;
				break;
			}
			
			if (*tpp == 128) *fpp = 0;
			tpp--;
			dpp--;
			fpp--;
			p--;
			if (*tpp < 128) (*tpp)++;

			// Rebuild the forward/reverse arrays so we can test the next character.
			memcpy(forward, pforward, 128);
			memcpy(reverse, preverse, 128);
			for (i=0; i<fpp - found; i++)
			{
				forward[(int)word[i]] = found[i];
				reverse[(int)found[i]] = word[i];
			}

			//printf("backtrack, next test is %c\n", *tpp);
		}	
	}
	
	free(found);
	free(testing);
	free(dp);
	free(reverse);
	free(forward);

	return 0;
}


int main(int argc, char *argv[])
{
	char **words;
	struct dict_entry dictionary;
	int dict_count;
	char forward[128], reverse[128];
	struct linked_list *result;
	
	memset(&dictionary, 0, sizeof(dictionary));
	memset(forward, 0, 128);
	memset(reverse, 0, 128);
			
	if (argc != 2)
	{
		fprintf(stderr, "usage: %s \"string to decrypt\"\n", argv[0]);
		return 1;
	}

	///printf("Loading dictionary...\n");
	if ( (dict_count = load_dictionary(&dictionary)) < 1 )
	{
		fprintf(stderr, "can't load dictionary\n");
		return 2;
	}

	//printf("testing, 'c' is in the tree = %p\n", dictionary.children['c']);
	//printf("testing, 'co' is in the tree = %p\n", dictionary.children['c']->children['o']);
	//printf("testing, 'cow' is a word = %d\n", dictionary.children['c']->children['o']->children['w']->word);

	//printf("Splitting...\n");
	if (split(argv[1], &words) == -1)
	{
		fprintf(stderr, "can't split words\n");
		return 3;
	}

	//printf("Finding...\n");
	find(words, &dictionary, forward, reverse, &result);

	while (result)
	{
		puts(result->data);
		result = result->next;
	}

	return 0;
}

