Algorithms-UCSanDiego

Data Structures and Algorithms by UCSanDiego

View project on GitHub

4. Suffix Array

Contents

Lecture

Memory Efficient Data Structure

Memory Storage Problem

Quiz

Memory Storage Solution

Suffix Array Order

Use the Burrows-Wheeler Transform Matrix (i.e. the sorted Cyclic Rotations) to find and order all suffixes of the original string ababaa$:

$ababaa
a$ababa
aa$abab
abaa$ab
ababaa$
baa$aba
babaa$a

Remove all characters after the $ delimiter and only the ordered suffixes remain:

$
a$
aa$
abaa$
ababaa$
baa$
babaa$

The Suffix Array Order is the begin index of each of the ordered suffixes in the original string:


ababaa$
01234567

order   suffix
-----   ------
        $
        a$
        aa$
        abaa$
        ababaa$
        baa$
        babaa$

      v
ababaa$
01234567
      ^

order   suffix
-----   ------
    6   $
        a$
        aa$
        abaa$
        ababaa$
        baa$
        babaa$

     vv
ababaa$
01234567
     ^^ 

order   suffix
-----   ------
    6   $
    5   a$
        aa$
        abaa$
        ababaa$
        baa$
        babaa$

    vvv
ababaa$
01234567
    ^^^

order   suffix
-----   ------
    6   $
    5   a$
    4   aa$
        abaa$
        ababaa$
        baa$
        babaa$

  vvvvv
ababaa$
01234567
  ^^^^^    

order   suffix
-----   ------
    6   $
    5   a$
    4   aa$
    2   abaa$
        ababaa$
        baa$
        babaa$

vvvvvvv
ababaa$
01234567
^^^^^^^    

order   suffix
-----   ------
    6   $
    5   a$
    4   aa$
    2   abaa$
    0   ababaa$
        baa$
        babaa$

   vvvv
ababaa$
01234567
   ^^^^    

order   suffix
-----   ------
    6   $
    5   a$
    4   aa$
    2   abaa$
    0   ababaa$
    3   baa$
        babaa$

 vvvvvv
ababaa$
01234567
 ^^^^^^    

order   suffix
-----   ------
    6   $
    5   a$
    4   aa$
    2   abaa$
    0   ababaa$
    3   baa$
    1   babaa$

General Strategy for Construction

Quiz

Partial Cyclic Shifts

Partial Cyclic Shift Strategy

Partial Cyclic Shift Example

Initialization

Counting Sort

Equivalence Classes

Sort Doubled Cyclic Shifts

Example

The following slides appear to have incorrect values for C6' and C3'. I believe the values in yellow are reversed:

Incorrect values?

C6' = $aba
C3' = baba

Correct values?

C6' = baba
C3' = $aba

Stable Sort

Required for C0' and C2' to remain in sorted order since we are only sorting my the first half and the second half is already sorted:

Pseudocode

Quiz

    For this quiz, calculate start[ i ]:

    start <- ( order[ i ] - L + |S| ) mod |S|
    
    
    S = ababaa$
    
    ( recall: Suffix Array 'order' == begin index of the ordered suffixes of the original string S )
    
    order   suffix    
    -----   ------
        6   $
        5   a$
        4   aa$
        2   abaa$
        0   ababaa$
        3   baa$
        1   babaa$
    
      L = 2
    |S| = 7
    
    ( order[ 6 ] - L + |S| ) % |S| == ( 1 - 2 + 7 ) % 7 == 6
    ( order[ 5 ] - L + |S| ) % |S| == ( 3 - 2 + 7 ) % 7 == 1
    ( order[ 4 ] - L + |S| ) % |S| == ( 0 - 2 + 7 ) % 7 == 5
    ( order[ 3 ] - L + |S| ) % |S| == ( 2 - 2 + 7 ) % 7 == 0
    ( order[ 2 ] - L + |S| ) % |S| == ( 4 - 2 + 7 ) % 7 == 2
    ( order[ 1 ] - L + |S| ) % |S| == ( 5 - 2 + 7 ) % 7 == 3
    ( order[ 0 ] - L + |S| ) % |S| == ( 6 - 2 + 7 ) % 7 == 4
                                                           ^
                                                           ^
    start[ i ] = [6,1,5,0,2,3,4] > > > > > > > > > > > > > ^
             i =  6 5 4 3 2 1 0
    

Updating Equivalence Classes

Quiz

FAQ

Final Quiz

Problem

Summary of Pseudocode

Solutions

CPP Main Only

    #include <iomanip>
    #include <iostream>
    #include <sstream>
    #include <algorithm>
    #include <iterator>
    #include <vector>
    #include <string>
    #include <cassert>
    
    using namespace std;
    using Collection = vector< int >;
    using Count = Collection;
    using Class = Collection;
    using Order = Collection;
    int main() {
        string S; cin >> S;
        const auto N = static_cast< int >( S.size() );
        Order order( N ); { // order <- SortCharacters( S )
            const auto M = 'T' + 1; // alphabet size
            Count cnt( M, 0 );
            for( auto i{ 0 }; i < N; ++i ){
                auto ch = S[ i ];
                ++cnt[ ch ]; // count of each unique char
            }
            for( auto j{ 1 }; j < M; ++j )
                cnt[ j ] += cnt[ j-1 ]; // prefix sums ( i.e. one-past the end of each unique char's index range )
            for( auto i{ N-1 }; 0 <= i; --i ){
                auto ch = S[ i ];
                --cnt[ ch ];
                order[ cnt[ ch ] ] = i;
            }
        }
        Class classes( N ); { // classes <- ComputeCharClasses( S, order )
            classes[ order[ 0 ] ] = 0;
            for( auto i{ 1 }; i < N; ++i )
                if( S[ order[ i ] ] != S[ order[ i-1 ] ] )
                    classes[ order[ i ] ] = classes[ order[ i-1 ] ] + 1; // diff equivalence class
                else
                    classes[ order[ i ] ] = classes[ order[ i-1 ] ];     // same equivalence class
        }
        for( auto L{ 1 }; L < N; L *= 2 ){ // order <- SortDoubled( S, L, order, classes )
            Count cnt( N, 0 ); {
                for( auto i{ 0 }; i < N; ++i ){
                    auto cl = classes[ i ];
                    ++cnt[ cl ];
                }
                for( auto j{ 1 }; j < N; ++j )
                    cnt[ j ] += cnt[ j-1 ];
            }
            Order next_order( N ); {
                for( auto i{ N-1 }; 0 <= i; --i ){
                    auto start = ( order[ i ] - L + N ) % N; // start is the begin index of the doubled cyclic shift
                    auto cl = classes[ start ];
                    --cnt[ cl ];
                    next_order[ cnt[ cl ] ] = start;
                }
            }
            order.swap( next_order );
            Class next_classes( N ); { // UpdateClasses( L, next_order, classes )
                next_classes[ order[ 0 ] ] = 0;
                for( auto i{ 1 }; i < N; ++i ){
                    auto pre = order[ i-1 ], preMid = ( pre + L ) % N,
                         cur = order[ i   ], curMid = ( cur + L ) % N;
                    if( classes[ pre ] != classes[ cur ] || classes[ preMid ] != classes[ curMid ] )
                        next_classes[ cur ] = next_classes[ pre ] + 1; // diff equivalence class
                    else
                        next_classes[ cur ] = next_classes[ pre ];     // same equivalence class
                }
            }
            classes.swap( next_classes );
        }
        copy_n( order.begin(), N, ostream_iterator< int >( cout, " " ) ); cout << endl;
        return 0;
    }

CPP Functions

    #include <iostream>
    #include <sstream>
    #include <algorithm>
    #include <iterator>
    #include <vector>
    #include <string>
    
    using namespace std;
    using VI = vector< int >;
    
    VI sortChars( const string& S, const int N ){
        const int M = 'T' + 1; // alphabet size
        VI order( N ), cnt( M );
        for( auto i{ 0 }; i < N; ++i ){
            auto c = S[ i ];
            ++cnt[ c ];
        }
        for( auto j{ 1 }; j < M; ++j )
            cnt[ j ] += cnt[ j-1 ];
        for( auto i{ N-1 }; 0 <= i; --i ){
            auto c = S[ i ];
            --cnt[ c ];
            order[ cnt[ c ] ] = i;
        }
        return order;
    }
    
    VI computeCharClasses( const string& S, const int N, const VI& order ){
        VI classes( N, 0 );
        for( auto i{ 1 }; i < N; ++i )
            if( S[ order[ i ] ] != S[ order[ i-1 ] ] )
                classes[ order[ i ] ] = classes[ order[ i-1 ] ] + 1;
            else
                classes[ order[ i ] ] = classes[ order[ i-1 ] ];
        return classes;
    }
    
    VI sortDoubled( const string& S, const int N, const int L, const VI& order, const VI& classes ){
        VI next_order( N, 0 ), cnt( N, 0 );
        for( auto i{ 0 }; i < N; ++i ){
            auto cl = classes[ i ];
            ++cnt[ cl ];
        }
        for( auto j{ 1 }; j < N; ++j )
            cnt[ j ] += cnt[ j-1 ];
        for( auto i{ N-1 }; 0 <= i; --i ){
            auto start = ( order[ i ] - L + N ) % N; // start is the begin index of the doubled cyclic shift
            auto cl = classes[ start ];
            --cnt[ cl ];
            next_order[ cnt[ cl ] ] = start;
        }
        return next_order;
    }
    
    VI updateClasses( const int N, const int L, const VI& order, const VI& classes ){
        VI next_classes( N, 0 );
        for( auto i{ 1 }; i < N; ++i ){
            auto pre = order[ i-1 ], preMid = ( pre + L ) % N,
                cur = order[ i   ], curMid = ( cur + L ) % N;
            if( classes[ pre ] != classes[ cur ] || classes[ preMid ] != classes[ curMid ] )
                next_classes[ cur ] = next_classes[ pre ] + 1; // diff equivalence class
            else
                next_classes[ cur ] = next_classes[ pre ];     // same equivalence class
        }
        return next_classes;
    }
    
    VI buildSuffixArray( const string& S, const int N ){
        auto order = sortChars( S, N );
        auto classes = computeCharClasses( S, N, order );
        for( auto L{ 1 }; L < N; L *= 2 ){
            order = sortDoubled( S, N, L, order, classes );
            classes = updateClasses( N, L, order, classes );
        }
        return order;
    }
    
    int main() {
        string S; cin >> S;
        const auto N = static_cast< int >( S.size() );
        auto order = buildSuffixArray( S, N );
        copy( order.begin(), order.end(), ostream_iterator< int >( cout, " " ) );
        return 0;
    }

CPP Object

    #include <iomanip>
    #include <iostream>
    #include <sstream>
    #include <algorithm>
    #include <iterator>
    #include <vector>
    #include <string>
    
    using namespace std;
    using VI = vector< int >;
    
    class SuffixArray {
    public:
        using VI = vector< int >;
        static VI build( const string& S, const int N ){
            auto order = sortChars( S, N );
            auto classes = computeCharClasses( S, N, order );
            for( auto L{ 1 }; L < N; L *= 2 ){
                order = sortDoubled( S, N, L, order, classes );
                classes = updateClasses( N, L, order, classes );
            }
            return order;
        }
    private:
        static VI sortChars( const string& S, const int N ){
            const int M = 'T' + 1; // alphabet size
            VI order( N ), cnt( M );
            for( auto i{ 0 }; i < N; ++i ){
                auto c = S[ i ];
                ++cnt[ c ];
            }
            for( auto j{ 1 }; j < M; ++j )
                cnt[ j ] += cnt[ j-1 ];
            for( auto i{ N-1 }; 0 <= i; --i ){
                auto c = S[ i ];
                --cnt[ c ];
                order[ cnt[ c ] ] = i;
            }
            return order;
        }
        static VI computeCharClasses( const string& S, const int N, const VI& order ){
            VI classes( N, 0 );
            for( auto i{ 1 }; i < N; ++i )
                if( S[ order[ i ] ] != S[ order[ i-1 ] ] )
                    classes[ order[ i ] ] = classes[ order[ i-1 ] ] + 1;
                else
                    classes[ order[ i ] ] = classes[ order[ i-1 ] ];
            return classes;
        }
        static VI sortDoubled( const string& S, const int N, const int L, const VI& order, const VI& classes ){
            VI next_order( N, 0 ), cnt( N, 0 );
            for( auto i{ 0 }; i < N; ++i ){
                auto cl = classes[ i ];
                ++cnt[ cl ];
            }
            for( auto j{ 1 }; j < N; ++j )
                cnt[ j ] += cnt[ j-1 ];
            for( auto i{ N-1 }; 0 <= i; --i ){
                auto start = ( order[ i ] - L + N ) % N; // start is the begin index of the doubled cyclic shift
                auto cl = classes[ start ];
                --cnt[ cl ];
                next_order[ cnt[ cl ] ] = start;
            }
            return next_order;
        }
        static VI updateClasses( const int N, const int L, const VI& order, const VI& classes ){
            VI next_classes( N, 0 );
            for( auto i{ 1 }; i < N; ++i ){
                auto pre = order[ i-1 ], preMid = ( pre + L ) % N,
                    cur = order[ i   ], curMid = ( cur + L ) % N;
                if( classes[ pre ] != classes[ cur ] || classes[ preMid ] != classes[ curMid ] )
                    next_classes[ cur ] = next_classes[ pre ] + 1; // diff equivalence class
                else
                    next_classes[ cur ] = next_classes[ pre ];     // same equivalence class
            }
            return next_classes;
        }
    };
    
    int main() {
        string S; cin >> S;
        const auto N = static_cast< int >( S.size() );
        auto order = SuffixArray::build( S, N );
        copy( order.begin(), order.end(), ostream_iterator< int >( cout, " " ) );
        return 0;
    }