Monday, October 10, 2011

String-matching automaton

static void Main(string[] args)
{
    if (args.Length < 2)
    {

        Console.WriteLine("Usage: {0} string_pattern string",
            Environment.GetCommandLineArgs()[0]);
        return;
    }

    string pattern = args[0].ToUpper();
    string str = args[1].ToUpper();

    // The automaton. Simple state machine that contains back-links to self-repeating
    // patterns inside the pattern. This takes a long time to compute:
    // O(pattern size cubed times size of the alphabet). This makes this algorithm
    // not very practical in general.

    const int alphabetLen = 'Z' - 'A' + 1;
    int [,] transitionFunc = new int[alphabetLen, pattern.Length];
    for (int stateIndex = 0; stateIndex < pattern.Length; ++stateIndex)
    {
        for (int ch = 0; ch < alphabetLen; ++ch)
        {
            string suffix = pattern.Substring(0, stateIndex) + (char)('A' + ch);
            int stateTransition = Math.Min(pattern.Length + 1, stateIndex + 2);
            string prefix;
            do
            {
                --stateTransition;
                prefix = pattern.Substring(0, stateTransition);
            }
            while (!suffix.EndsWith(prefix));
            transitionFunc[ch, stateIndex] = stateTransition;
        }
    }

    // The match.
    int state = 0;
    for (int i = 0; i < str.Length; ++i)
    {
        state = transitionFunc[str[i] - 'A', state];

        if (state == pattern.Length)
        {
            Console.WriteLine("Substring found at " + (i - pattern.Length + 1));
            break;
        }
    }
}

No comments: