4. Suffix Array
Contents
Lecture
Memory Efficient Data Structure

Memory Storage Problem

Quiz

Memory Storage Solution

Suffix Array Order
Use the Burrows-Wheeler Transform Matrix
(i.e. the sorted Cyclic Rotations) to find and order all suffixes of the original string ababaa$:
$ababaa
a$ababa
aa$abab
abaa$ab
ababaa$
baa$aba
babaa$a
Remove all characters after the $ delimiter and only the ordered suffixes remain:
$
a$
aa$
abaa$
ababaa$
baa$
babaa$
The Suffix Array Order is the begin index of each of the ordered suffixes in the original string:

ababaa$
01234567
order suffix
----- ------
$
a$
aa$
abaa$
ababaa$
baa$
babaa$

v
ababaa$
01234567
^
order suffix
----- ------
6 $
a$
aa$
abaa$
ababaa$
baa$
babaa$

vv
ababaa$
01234567
^^
order suffix
----- ------
6 $
5 a$
aa$
abaa$
ababaa$
baa$
babaa$

vvv
ababaa$
01234567
^^^
order suffix
----- ------
6 $
5 a$
4 aa$
abaa$
ababaa$
baa$
babaa$

vvvvv
ababaa$
01234567
^^^^^
order suffix
----- ------
6 $
5 a$
4 aa$
2 abaa$
ababaa$
baa$
babaa$

vvvvvvv
ababaa$
01234567
^^^^^^^
order suffix
----- ------
6 $
5 a$
4 aa$
2 abaa$
0 ababaa$
baa$
babaa$

vvvv
ababaa$
01234567
^^^^
order suffix
----- ------
6 $
5 a$
4 aa$
2 abaa$
0 ababaa$
3 baa$
babaa$

vvvvvv
ababaa$
01234567
^^^^^^
order suffix
----- ------
6 $
5 a$
4 aa$
2 abaa$
0 ababaa$
3 baa$
1 babaa$

General Strategy for Construction

Quiz

Partial Cyclic Shifts

Partial Cyclic Shift Strategy

Partial Cyclic Shift Example

Initialization
Counting Sort

Equivalence Classes

Sort Doubled Cyclic Shifts

Example

The following slides appear to have incorrect values for C6' and C3'. I believe the values
in yellow are reversed:
Incorrect values?
C6' = $aba
C3' = baba
Correct values?
C6' = baba
C3' = $aba

Stable Sort
Required for C0' and C2' to remain in sorted order since we are only sorting my the first half
and the second half is already sorted:

Pseudocode

Quiz

For this quiz, calculate start[ i ]:
start <- ( order[ i ] - L + |S| ) mod |S|
S = ababaa$
( recall: Suffix Array 'order' == begin index of the ordered suffixes of the original string S )
order suffix
----- ------
6 $
5 a$
4 aa$
2 abaa$
0 ababaa$
3 baa$
1 babaa$
L = 2
|S| = 7
( order[ 6 ] - L + |S| ) % |S| == ( 1 - 2 + 7 ) % 7 == 6
( order[ 5 ] - L + |S| ) % |S| == ( 3 - 2 + 7 ) % 7 == 1
( order[ 4 ] - L + |S| ) % |S| == ( 0 - 2 + 7 ) % 7 == 5
( order[ 3 ] - L + |S| ) % |S| == ( 2 - 2 + 7 ) % 7 == 0
( order[ 2 ] - L + |S| ) % |S| == ( 4 - 2 + 7 ) % 7 == 2
( order[ 1 ] - L + |S| ) % |S| == ( 5 - 2 + 7 ) % 7 == 3
( order[ 0 ] - L + |S| ) % |S| == ( 6 - 2 + 7 ) % 7 == 4
^
^
start[ i ] = [6,1,5,0,2,3,4] > > > > > > > > > > > > > ^
i = 6 5 4 3 2 1 0
Updating Equivalence Classes

Quiz

FAQ
Final Quiz

Problem

Summary of Pseudocode

Solutions
CPP Main Only
#include <iomanip>
#include <iostream>
#include <sstream>
#include <algorithm>
#include <iterator>
#include <vector>
#include <string>
#include <cassert>
using namespace std;
using Collection = vector< int >;
using Count = Collection;
using Class = Collection;
using Order = Collection;
int main() {
string S; cin >> S;
const auto N = static_cast< int >( S.size() );
Order order( N ); { // order <- SortCharacters( S )
const auto M = 'T' + 1; // alphabet size
Count cnt( M, 0 );
for( auto i{ 0 }; i < N; ++i ){
auto ch = S[ i ];
++cnt[ ch ]; // count of each unique char
}
for( auto j{ 1 }; j < M; ++j )
cnt[ j ] += cnt[ j-1 ]; // prefix sums ( i.e. one-past the end of each unique char's index range )
for( auto i{ N-1 }; 0 <= i; --i ){
auto ch = S[ i ];
--cnt[ ch ];
order[ cnt[ ch ] ] = i;
}
}
Class classes( N ); { // classes <- ComputeCharClasses( S, order )
classes[ order[ 0 ] ] = 0;
for( auto i{ 1 }; i < N; ++i )
if( S[ order[ i ] ] != S[ order[ i-1 ] ] )
classes[ order[ i ] ] = classes[ order[ i-1 ] ] + 1; // diff equivalence class
else
classes[ order[ i ] ] = classes[ order[ i-1 ] ]; // same equivalence class
}
for( auto L{ 1 }; L < N; L *= 2 ){ // order <- SortDoubled( S, L, order, classes )
Count cnt( N, 0 ); {
for( auto i{ 0 }; i < N; ++i ){
auto cl = classes[ i ];
++cnt[ cl ];
}
for( auto j{ 1 }; j < N; ++j )
cnt[ j ] += cnt[ j-1 ];
}
Order next_order( N ); {
for( auto i{ N-1 }; 0 <= i; --i ){
auto start = ( order[ i ] - L + N ) % N; // start is the begin index of the doubled cyclic shift
auto cl = classes[ start ];
--cnt[ cl ];
next_order[ cnt[ cl ] ] = start;
}
}
order.swap( next_order );
Class next_classes( N ); { // UpdateClasses( L, next_order, classes )
next_classes[ order[ 0 ] ] = 0;
for( auto i{ 1 }; i < N; ++i ){
auto pre = order[ i-1 ], preMid = ( pre + L ) % N,
cur = order[ i ], curMid = ( cur + L ) % N;
if( classes[ pre ] != classes[ cur ] || classes[ preMid ] != classes[ curMid ] )
next_classes[ cur ] = next_classes[ pre ] + 1; // diff equivalence class
else
next_classes[ cur ] = next_classes[ pre ]; // same equivalence class
}
}
classes.swap( next_classes );
}
copy_n( order.begin(), N, ostream_iterator< int >( cout, " " ) ); cout << endl;
return 0;
}
CPP Functions
#include <iostream>
#include <sstream>
#include <algorithm>
#include <iterator>
#include <vector>
#include <string>
using namespace std;
using VI = vector< int >;
VI sortChars( const string& S, const int N ){
const int M = 'T' + 1; // alphabet size
VI order( N ), cnt( M );
for( auto i{ 0 }; i < N; ++i ){
auto c = S[ i ];
++cnt[ c ];
}
for( auto j{ 1 }; j < M; ++j )
cnt[ j ] += cnt[ j-1 ];
for( auto i{ N-1 }; 0 <= i; --i ){
auto c = S[ i ];
--cnt[ c ];
order[ cnt[ c ] ] = i;
}
return order;
}
VI computeCharClasses( const string& S, const int N, const VI& order ){
VI classes( N, 0 );
for( auto i{ 1 }; i < N; ++i )
if( S[ order[ i ] ] != S[ order[ i-1 ] ] )
classes[ order[ i ] ] = classes[ order[ i-1 ] ] + 1;
else
classes[ order[ i ] ] = classes[ order[ i-1 ] ];
return classes;
}
VI sortDoubled( const string& S, const int N, const int L, const VI& order, const VI& classes ){
VI next_order( N, 0 ), cnt( N, 0 );
for( auto i{ 0 }; i < N; ++i ){
auto cl = classes[ i ];
++cnt[ cl ];
}
for( auto j{ 1 }; j < N; ++j )
cnt[ j ] += cnt[ j-1 ];
for( auto i{ N-1 }; 0 <= i; --i ){
auto start = ( order[ i ] - L + N ) % N; // start is the begin index of the doubled cyclic shift
auto cl = classes[ start ];
--cnt[ cl ];
next_order[ cnt[ cl ] ] = start;
}
return next_order;
}
VI updateClasses( const int N, const int L, const VI& order, const VI& classes ){
VI next_classes( N, 0 );
for( auto i{ 1 }; i < N; ++i ){
auto pre = order[ i-1 ], preMid = ( pre + L ) % N,
cur = order[ i ], curMid = ( cur + L ) % N;
if( classes[ pre ] != classes[ cur ] || classes[ preMid ] != classes[ curMid ] )
next_classes[ cur ] = next_classes[ pre ] + 1; // diff equivalence class
else
next_classes[ cur ] = next_classes[ pre ]; // same equivalence class
}
return next_classes;
}
VI buildSuffixArray( const string& S, const int N ){
auto order = sortChars( S, N );
auto classes = computeCharClasses( S, N, order );
for( auto L{ 1 }; L < N; L *= 2 ){
order = sortDoubled( S, N, L, order, classes );
classes = updateClasses( N, L, order, classes );
}
return order;
}
int main() {
string S; cin >> S;
const auto N = static_cast< int >( S.size() );
auto order = buildSuffixArray( S, N );
copy( order.begin(), order.end(), ostream_iterator< int >( cout, " " ) );
return 0;
}
CPP Object
#include <iomanip>
#include <iostream>
#include <sstream>
#include <algorithm>
#include <iterator>
#include <vector>
#include <string>
using namespace std;
using VI = vector< int >;
class SuffixArray {
public:
using VI = vector< int >;
static VI build( const string& S, const int N ){
auto order = sortChars( S, N );
auto classes = computeCharClasses( S, N, order );
for( auto L{ 1 }; L < N; L *= 2 ){
order = sortDoubled( S, N, L, order, classes );
classes = updateClasses( N, L, order, classes );
}
return order;
}
private:
static VI sortChars( const string& S, const int N ){
const int M = 'T' + 1; // alphabet size
VI order( N ), cnt( M );
for( auto i{ 0 }; i < N; ++i ){
auto c = S[ i ];
++cnt[ c ];
}
for( auto j{ 1 }; j < M; ++j )
cnt[ j ] += cnt[ j-1 ];
for( auto i{ N-1 }; 0 <= i; --i ){
auto c = S[ i ];
--cnt[ c ];
order[ cnt[ c ] ] = i;
}
return order;
}
static VI computeCharClasses( const string& S, const int N, const VI& order ){
VI classes( N, 0 );
for( auto i{ 1 }; i < N; ++i )
if( S[ order[ i ] ] != S[ order[ i-1 ] ] )
classes[ order[ i ] ] = classes[ order[ i-1 ] ] + 1;
else
classes[ order[ i ] ] = classes[ order[ i-1 ] ];
return classes;
}
static VI sortDoubled( const string& S, const int N, const int L, const VI& order, const VI& classes ){
VI next_order( N, 0 ), cnt( N, 0 );
for( auto i{ 0 }; i < N; ++i ){
auto cl = classes[ i ];
++cnt[ cl ];
}
for( auto j{ 1 }; j < N; ++j )
cnt[ j ] += cnt[ j-1 ];
for( auto i{ N-1 }; 0 <= i; --i ){
auto start = ( order[ i ] - L + N ) % N; // start is the begin index of the doubled cyclic shift
auto cl = classes[ start ];
--cnt[ cl ];
next_order[ cnt[ cl ] ] = start;
}
return next_order;
}
static VI updateClasses( const int N, const int L, const VI& order, const VI& classes ){
VI next_classes( N, 0 );
for( auto i{ 1 }; i < N; ++i ){
auto pre = order[ i-1 ], preMid = ( pre + L ) % N,
cur = order[ i ], curMid = ( cur + L ) % N;
if( classes[ pre ] != classes[ cur ] || classes[ preMid ] != classes[ curMid ] )
next_classes[ cur ] = next_classes[ pre ] + 1; // diff equivalence class
else
next_classes[ cur ] = next_classes[ pre ]; // same equivalence class
}
return next_classes;
}
};
int main() {
string S; cin >> S;
const auto N = static_cast< int >( S.size() );
auto order = SuffixArray::build( S, N );
copy( order.begin(), order.end(), ostream_iterator< int >( cout, " " ) );
return 0;
}