4. Suffix Array
Contents
Lecture
Memory Efficient Data Structure
Memory Storage Problem
Quiz
Memory Storage Solution
Suffix Array Order
Use the Burrows-Wheeler Transform Matrix
(i.e. the sorted Cyclic Rotations) to find and order all suffixes of the original string ababaa$
:
$ababaa
a$ababa
aa$abab
abaa$ab
ababaa$
baa$aba
babaa$a
Remove all characters after the $
delimiter and only the ordered suffixes remain:
$
a$
aa$
abaa$
ababaa$
baa$
babaa$
The Suffix Array Order is the begin index of each of the ordered suffixes in the original string:
ababaa$
01234567
order suffix
----- ------
$
a$
aa$
abaa$
ababaa$
baa$
babaa$
v
ababaa$
01234567
^
order suffix
----- ------
6 $
a$
aa$
abaa$
ababaa$
baa$
babaa$
vv
ababaa$
01234567
^^
order suffix
----- ------
6 $
5 a$
aa$
abaa$
ababaa$
baa$
babaa$
vvv
ababaa$
01234567
^^^
order suffix
----- ------
6 $
5 a$
4 aa$
abaa$
ababaa$
baa$
babaa$
vvvvv
ababaa$
01234567
^^^^^
order suffix
----- ------
6 $
5 a$
4 aa$
2 abaa$
ababaa$
baa$
babaa$
vvvvvvv
ababaa$
01234567
^^^^^^^
order suffix
----- ------
6 $
5 a$
4 aa$
2 abaa$
0 ababaa$
baa$
babaa$
vvvv
ababaa$
01234567
^^^^
order suffix
----- ------
6 $
5 a$
4 aa$
2 abaa$
0 ababaa$
3 baa$
babaa$
vvvvvv
ababaa$
01234567
^^^^^^
order suffix
----- ------
6 $
5 a$
4 aa$
2 abaa$
0 ababaa$
3 baa$
1 babaa$
General Strategy for Construction
Quiz
Partial Cyclic Shifts
Partial Cyclic Shift Strategy
Partial Cyclic Shift Example
Initialization
Counting Sort
Equivalence Classes
Sort Doubled Cyclic Shifts
Example
The following slides appear to have incorrect values for C6'
and C3'
. I believe the values
in yellow are reversed:
Incorrect values?
C6' = $aba
C3' = baba
Correct values?
C6' = baba
C3' = $aba
Stable Sort
Required for C0'
and C2'
to remain in sorted order since we are only sorting my the first half
and the second half is already sorted:
Pseudocode
Quiz
For this quiz, calculate start[ i ]:
start <- ( order[ i ] - L + |S| ) mod |S|
S = ababaa$
( recall: Suffix Array 'order' == begin index of the ordered suffixes of the original string S )
order suffix
----- ------
6 $
5 a$
4 aa$
2 abaa$
0 ababaa$
3 baa$
1 babaa$
L = 2
|S| = 7
( order[ 6 ] - L + |S| ) % |S| == ( 1 - 2 + 7 ) % 7 == 6
( order[ 5 ] - L + |S| ) % |S| == ( 3 - 2 + 7 ) % 7 == 1
( order[ 4 ] - L + |S| ) % |S| == ( 0 - 2 + 7 ) % 7 == 5
( order[ 3 ] - L + |S| ) % |S| == ( 2 - 2 + 7 ) % 7 == 0
( order[ 2 ] - L + |S| ) % |S| == ( 4 - 2 + 7 ) % 7 == 2
( order[ 1 ] - L + |S| ) % |S| == ( 5 - 2 + 7 ) % 7 == 3
( order[ 0 ] - L + |S| ) % |S| == ( 6 - 2 + 7 ) % 7 == 4
^
^
start[ i ] = [6,1,5,0,2,3,4] > > > > > > > > > > > > > ^
i = 6 5 4 3 2 1 0
Updating Equivalence Classes
Quiz
FAQ
Final Quiz
Problem
Summary of Pseudocode
Solutions
CPP Main Only
#include <iomanip>
#include <iostream>
#include <sstream>
#include <algorithm>
#include <iterator>
#include <vector>
#include <string>
#include <cassert>
using namespace std;
using Collection = vector< int >;
using Count = Collection;
using Class = Collection;
using Order = Collection;
int main() {
string S; cin >> S;
const auto N = static_cast< int >( S.size() );
Order order( N ); { // order <- SortCharacters( S )
const auto M = 'T' + 1; // alphabet size
Count cnt( M, 0 );
for( auto i{ 0 }; i < N; ++i ){
auto ch = S[ i ];
++cnt[ ch ]; // count of each unique char
}
for( auto j{ 1 }; j < M; ++j )
cnt[ j ] += cnt[ j-1 ]; // prefix sums ( i.e. one-past the end of each unique char's index range )
for( auto i{ N-1 }; 0 <= i; --i ){
auto ch = S[ i ];
--cnt[ ch ];
order[ cnt[ ch ] ] = i;
}
}
Class classes( N ); { // classes <- ComputeCharClasses( S, order )
classes[ order[ 0 ] ] = 0;
for( auto i{ 1 }; i < N; ++i )
if( S[ order[ i ] ] != S[ order[ i-1 ] ] )
classes[ order[ i ] ] = classes[ order[ i-1 ] ] + 1; // diff equivalence class
else
classes[ order[ i ] ] = classes[ order[ i-1 ] ]; // same equivalence class
}
for( auto L{ 1 }; L < N; L *= 2 ){ // order <- SortDoubled( S, L, order, classes )
Count cnt( N, 0 ); {
for( auto i{ 0 }; i < N; ++i ){
auto cl = classes[ i ];
++cnt[ cl ];
}
for( auto j{ 1 }; j < N; ++j )
cnt[ j ] += cnt[ j-1 ];
}
Order next_order( N ); {
for( auto i{ N-1 }; 0 <= i; --i ){
auto start = ( order[ i ] - L + N ) % N; // start is the begin index of the doubled cyclic shift
auto cl = classes[ start ];
--cnt[ cl ];
next_order[ cnt[ cl ] ] = start;
}
}
order.swap( next_order );
Class next_classes( N ); { // UpdateClasses( L, next_order, classes )
next_classes[ order[ 0 ] ] = 0;
for( auto i{ 1 }; i < N; ++i ){
auto pre = order[ i-1 ], preMid = ( pre + L ) % N,
cur = order[ i ], curMid = ( cur + L ) % N;
if( classes[ pre ] != classes[ cur ] || classes[ preMid ] != classes[ curMid ] )
next_classes[ cur ] = next_classes[ pre ] + 1; // diff equivalence class
else
next_classes[ cur ] = next_classes[ pre ]; // same equivalence class
}
}
classes.swap( next_classes );
}
copy_n( order.begin(), N, ostream_iterator< int >( cout, " " ) ); cout << endl;
return 0;
}
CPP Functions
#include <iostream>
#include <sstream>
#include <algorithm>
#include <iterator>
#include <vector>
#include <string>
using namespace std;
using VI = vector< int >;
VI sortChars( const string& S, const int N ){
const int M = 'T' + 1; // alphabet size
VI order( N ), cnt( M );
for( auto i{ 0 }; i < N; ++i ){
auto c = S[ i ];
++cnt[ c ];
}
for( auto j{ 1 }; j < M; ++j )
cnt[ j ] += cnt[ j-1 ];
for( auto i{ N-1 }; 0 <= i; --i ){
auto c = S[ i ];
--cnt[ c ];
order[ cnt[ c ] ] = i;
}
return order;
}
VI computeCharClasses( const string& S, const int N, const VI& order ){
VI classes( N, 0 );
for( auto i{ 1 }; i < N; ++i )
if( S[ order[ i ] ] != S[ order[ i-1 ] ] )
classes[ order[ i ] ] = classes[ order[ i-1 ] ] + 1;
else
classes[ order[ i ] ] = classes[ order[ i-1 ] ];
return classes;
}
VI sortDoubled( const string& S, const int N, const int L, const VI& order, const VI& classes ){
VI next_order( N, 0 ), cnt( N, 0 );
for( auto i{ 0 }; i < N; ++i ){
auto cl = classes[ i ];
++cnt[ cl ];
}
for( auto j{ 1 }; j < N; ++j )
cnt[ j ] += cnt[ j-1 ];
for( auto i{ N-1 }; 0 <= i; --i ){
auto start = ( order[ i ] - L + N ) % N; // start is the begin index of the doubled cyclic shift
auto cl = classes[ start ];
--cnt[ cl ];
next_order[ cnt[ cl ] ] = start;
}
return next_order;
}
VI updateClasses( const int N, const int L, const VI& order, const VI& classes ){
VI next_classes( N, 0 );
for( auto i{ 1 }; i < N; ++i ){
auto pre = order[ i-1 ], preMid = ( pre + L ) % N,
cur = order[ i ], curMid = ( cur + L ) % N;
if( classes[ pre ] != classes[ cur ] || classes[ preMid ] != classes[ curMid ] )
next_classes[ cur ] = next_classes[ pre ] + 1; // diff equivalence class
else
next_classes[ cur ] = next_classes[ pre ]; // same equivalence class
}
return next_classes;
}
VI buildSuffixArray( const string& S, const int N ){
auto order = sortChars( S, N );
auto classes = computeCharClasses( S, N, order );
for( auto L{ 1 }; L < N; L *= 2 ){
order = sortDoubled( S, N, L, order, classes );
classes = updateClasses( N, L, order, classes );
}
return order;
}
int main() {
string S; cin >> S;
const auto N = static_cast< int >( S.size() );
auto order = buildSuffixArray( S, N );
copy( order.begin(), order.end(), ostream_iterator< int >( cout, " " ) );
return 0;
}
CPP Object
#include <iomanip>
#include <iostream>
#include <sstream>
#include <algorithm>
#include <iterator>
#include <vector>
#include <string>
using namespace std;
using VI = vector< int >;
class SuffixArray {
public:
using VI = vector< int >;
static VI build( const string& S, const int N ){
auto order = sortChars( S, N );
auto classes = computeCharClasses( S, N, order );
for( auto L{ 1 }; L < N; L *= 2 ){
order = sortDoubled( S, N, L, order, classes );
classes = updateClasses( N, L, order, classes );
}
return order;
}
private:
static VI sortChars( const string& S, const int N ){
const int M = 'T' + 1; // alphabet size
VI order( N ), cnt( M );
for( auto i{ 0 }; i < N; ++i ){
auto c = S[ i ];
++cnt[ c ];
}
for( auto j{ 1 }; j < M; ++j )
cnt[ j ] += cnt[ j-1 ];
for( auto i{ N-1 }; 0 <= i; --i ){
auto c = S[ i ];
--cnt[ c ];
order[ cnt[ c ] ] = i;
}
return order;
}
static VI computeCharClasses( const string& S, const int N, const VI& order ){
VI classes( N, 0 );
for( auto i{ 1 }; i < N; ++i )
if( S[ order[ i ] ] != S[ order[ i-1 ] ] )
classes[ order[ i ] ] = classes[ order[ i-1 ] ] + 1;
else
classes[ order[ i ] ] = classes[ order[ i-1 ] ];
return classes;
}
static VI sortDoubled( const string& S, const int N, const int L, const VI& order, const VI& classes ){
VI next_order( N, 0 ), cnt( N, 0 );
for( auto i{ 0 }; i < N; ++i ){
auto cl = classes[ i ];
++cnt[ cl ];
}
for( auto j{ 1 }; j < N; ++j )
cnt[ j ] += cnt[ j-1 ];
for( auto i{ N-1 }; 0 <= i; --i ){
auto start = ( order[ i ] - L + N ) % N; // start is the begin index of the doubled cyclic shift
auto cl = classes[ start ];
--cnt[ cl ];
next_order[ cnt[ cl ] ] = start;
}
return next_order;
}
static VI updateClasses( const int N, const int L, const VI& order, const VI& classes ){
VI next_classes( N, 0 );
for( auto i{ 1 }; i < N; ++i ){
auto pre = order[ i-1 ], preMid = ( pre + L ) % N,
cur = order[ i ], curMid = ( cur + L ) % N;
if( classes[ pre ] != classes[ cur ] || classes[ preMid ] != classes[ curMid ] )
next_classes[ cur ] = next_classes[ pre ] + 1; // diff equivalence class
else
next_classes[ cur ] = next_classes[ pre ]; // same equivalence class
}
return next_classes;
}
};
int main() {
string S; cin >> S;
const auto N = static_cast< int >( S.size() );
auto order = SuffixArray::build( S, N );
copy( order.begin(), order.end(), ostream_iterator< int >( cout, " " ) );
return 0;
}