345 lines
9.1 KiB
C
345 lines
9.1 KiB
C
#define _GNU_SOURCE
|
|
|
|
#include "fmindex.h"
|
|
#include "util.h"
|
|
|
|
#include <limits.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
inline static int string_index(char *s, char c) { return strchr(s, c) - s; }
|
|
|
|
static int CompareChar(const void *a, const void *b) {
|
|
char i = *(char *)a;
|
|
char j = *(char *)b;
|
|
|
|
// Dollar sign is always sorted first.
|
|
if (i == '$')
|
|
return -1;
|
|
if (j == '$')
|
|
return 1;
|
|
|
|
return i > j;
|
|
}
|
|
|
|
/* Converts the given string to a newly allocated string
|
|
* of all distinct characters, sorted in lexicographical order.
|
|
* The dollar sign is always sorted first.
|
|
* Return NULL on memory allocation error.
|
|
*/
|
|
char *TextToAlphabet(char *text, size_t sz) {
|
|
char *alphabet = calloc(2, sizeof(char));
|
|
if (!alphabet)
|
|
return NULL;
|
|
|
|
alphabet[0] = '$';
|
|
alphabet[1] = '\0';
|
|
int len = 2;
|
|
for (size_t i = 0; i < sz; ++i) {
|
|
if (!strchr(alphabet, text[i])) {
|
|
len += 1;
|
|
alphabet = realloc(alphabet, len);
|
|
alphabet[len - 2] = text[i];
|
|
alphabet[len - 1] = '\0';
|
|
}
|
|
}
|
|
|
|
qsort(alphabet, len - 1, sizeof(char), &CompareChar);
|
|
|
|
return alphabet;
|
|
}
|
|
|
|
static int CompareSuffixArray(const void *a, const void *b, void *arg) {
|
|
sa_t i = *(sa_t *)a;
|
|
sa_t j = *(sa_t *)b;
|
|
char *s = arg;
|
|
|
|
while (s[i] != '\0' && s[j] != '\0' && s[i] == s[j]) {
|
|
++i;
|
|
++j;
|
|
}
|
|
|
|
if (s[i] == '\0')
|
|
return -1;
|
|
if (s[j] == '\0')
|
|
return 1;
|
|
|
|
return s[i] > s[j];
|
|
}
|
|
|
|
/* Construct a newly allocated suffix array for the given string.
|
|
* The suffix array holds the index for each suffix in the given string.
|
|
* Then the indices are sorted by the lexicographical ordering of the suffices.
|
|
* Return NULL on memory allocation error.
|
|
*/
|
|
sa_t *ConstructSuffixArray(char *s, size_t sz) {
|
|
sa_t *suffix_array = calloc(sz + 1, sizeof(sa_t));
|
|
if (!suffix_array)
|
|
return NULL;
|
|
|
|
for (size_t i = 0; i < sz + 1; ++i)
|
|
suffix_array[i] = i;
|
|
|
|
qsort_r(suffix_array, sz + 1, sizeof(sa_t), &CompareSuffixArray, s);
|
|
|
|
return suffix_array;
|
|
}
|
|
|
|
/* Construct the Burrows-Wheeler transform of the given string,
|
|
* using the corresponding suffix array.
|
|
* Return NULL on memory allocation error.
|
|
*/
|
|
char *ConstructBWT(char *s, size_t sz, sa_t *suffix_array) {
|
|
char *bwt = calloc(sz + 2, sizeof(char));
|
|
if (!bwt)
|
|
return NULL;
|
|
|
|
for (size_t i = 0; i < sz + 1; ++i) {
|
|
sa_t n = suffix_array[i];
|
|
// Index 0 is always the dollar sign.
|
|
bwt[i] = (n) ? s[suffix_array[i] - 1] : '$';
|
|
}
|
|
bwt[sz + 1] = '\0';
|
|
|
|
return bwt;
|
|
}
|
|
|
|
/* Construct the rank matrix for the given Burrows-Wheeler transformed string.
|
|
* The rank matrix is an (alphabet X bwt) sized array that holds the accumulated
|
|
* counts of encountered characters in the BWT.
|
|
* A newly allocated rank matrix is returned, or NULL on memory error.
|
|
*/
|
|
ranks_t *ConstructRankMatrix(char *bwt, size_t sz, char *alphabet) {
|
|
size_t alphabet_sz = strlen(alphabet);
|
|
|
|
ranks_t *rank_matrix = calloc(sz * alphabet_sz, sizeof(ranks_t));
|
|
if (!rank_matrix) {
|
|
printf("Failed to allocate %lu bytes.\n", sz * alphabet_sz);
|
|
return NULL;
|
|
}
|
|
|
|
ranks_t acc[alphabet_sz];
|
|
|
|
// Initialize counts with zero.
|
|
for (size_t i = 0; i < alphabet_sz; ++i)
|
|
acc[i] = 0;
|
|
|
|
for (size_t i = 0; i < sz; ++i) {
|
|
char c = bwt[i];
|
|
|
|
// Update accumulator.
|
|
for (unsigned j = 0; j < alphabet_sz; ++j)
|
|
if (c == alphabet[j]) {
|
|
++acc[j];
|
|
break;
|
|
}
|
|
|
|
for (unsigned j = 0; j < alphabet_sz; ++j)
|
|
rank_matrix[i * alphabet_sz + j] = acc[j];
|
|
}
|
|
|
|
return rank_matrix;
|
|
}
|
|
|
|
/* Calculate the ranges in the "F column" where each character
|
|
* in the alphabet appears.
|
|
* Return a (2 X alphabet) matrix where each row means the starting
|
|
* and ending range for a character.
|
|
* Return NULL on memory error.
|
|
*/
|
|
ranges_t *ConstructCharacterRanges(char *bwt, size_t sz, char *alphabet) {
|
|
size_t alphabet_sz = strlen(alphabet);
|
|
unsigned long counts[alphabet_sz];
|
|
|
|
// Set counts to zero.
|
|
for (size_t i = 0; i < alphabet_sz; ++i)
|
|
counts[i] = 0;
|
|
|
|
// Count total amounts of characters.
|
|
for (size_t i = 0; i < sz; ++i)
|
|
counts[string_index(alphabet, bwt[i])]++;
|
|
|
|
// Accumulate counts.
|
|
size_t acc = 0;
|
|
for (size_t i = 0; i < alphabet_sz; ++i) {
|
|
counts[i] += acc;
|
|
acc = counts[i];
|
|
}
|
|
|
|
// Calculate ranges.
|
|
ranges_t *ranges = calloc(2 * alphabet_sz, sizeof(ranges_t));
|
|
if (!ranges)
|
|
return NULL;
|
|
|
|
unsigned long cur_idx = 0;
|
|
for (size_t i = 0; i < alphabet_sz; ++i) {
|
|
ranges[2 * i] = cur_idx;
|
|
ranges[2 * i + 1] = counts[i];
|
|
cur_idx = counts[i];
|
|
}
|
|
|
|
return ranges;
|
|
}
|
|
|
|
/* Find the range of matches for the given pattern in the F column of the
|
|
* given FM-index.
|
|
*/
|
|
void FMIndexFindMatchRange(fm_index *fm, char *pattern, size_t pattern_sz,
|
|
ranges_t *start, ranges_t *end) {
|
|
int p_idx = pattern_sz - 1;
|
|
char c = pattern[p_idx];
|
|
// Initial range is all instances of the last character in pattern.
|
|
*start = fm->ranges[2 * string_index(fm->alphabet, c)];
|
|
*end = fm->ranges[2 * string_index(fm->alphabet, c) + 1];
|
|
|
|
p_idx -= 1;
|
|
while (p_idx >= 0 && *end > 1) {
|
|
c = pattern[p_idx];
|
|
ranges_t range_start = fm->ranges[2 * string_index(fm->alphabet, c)];
|
|
int alphabet_idx = string_index(fm->alphabet, c);
|
|
*start =
|
|
range_start + fm->ranks[fm->alphabet_sz * (*start - 1) + alphabet_idx];
|
|
*end = range_start + fm->ranks[fm->alphabet_sz * (*end - 1) + alphabet_idx];
|
|
p_idx -= 1;
|
|
}
|
|
}
|
|
|
|
/* Find the matching indices in the original text for the given
|
|
* range in the "F column" of the Burrows-Wheeler matrix.
|
|
*/
|
|
void FMIndexFindRangeIndices(fm_index *fm, ranges_t start, ranges_t end,
|
|
unsigned long **match_indices) {
|
|
for (unsigned long i = 0; i < end - start; ++i)
|
|
(*match_indices)[i] = fm->sa[start + i];
|
|
}
|
|
|
|
fm_index *FMIndexConstruct(char *s) {
|
|
fm_index *index = malloc(sizeof(fm_index));
|
|
if (!index)
|
|
return NULL;
|
|
memset(index, 0, sizeof(fm_index));
|
|
|
|
size_t sz = strlen(s);
|
|
if (!(index->alphabet = TextToAlphabet(s, sz)))
|
|
goto error;
|
|
index->alphabet_sz = strlen(index->alphabet);
|
|
if (!(index->sa = ConstructSuffixArray(s, sz)))
|
|
goto error;
|
|
if (!(index->bwt = ConstructBWT(s, sz, index->sa)))
|
|
goto error;
|
|
++sz; // Because of the added dollar sign.
|
|
index->bwt_sz = sz;
|
|
if (!(index->ranks = ConstructRankMatrix(index->bwt, sz, index->alphabet)))
|
|
goto error;
|
|
if (!(index->ranges =
|
|
ConstructCharacterRanges(index->bwt, sz, index->alphabet)))
|
|
goto error;
|
|
|
|
return index;
|
|
|
|
error:
|
|
if (index->alphabet)
|
|
free(index->alphabet);
|
|
if (index->sa)
|
|
free(index->sa);
|
|
if (index->bwt)
|
|
free(index->bwt);
|
|
if (index->ranks)
|
|
free(index->ranks);
|
|
if (index->ranges)
|
|
free(index->ranges);
|
|
|
|
free(index);
|
|
return NULL;
|
|
}
|
|
|
|
void FMIndexFree(fm_index *index) {
|
|
free(index->alphabet);
|
|
free(index->sa);
|
|
free(index->bwt);
|
|
free(index->ranks);
|
|
free(index->ranges);
|
|
free(index);
|
|
}
|
|
|
|
int FMIndexDumpToFile(fm_index *index, char *filename) {
|
|
FILE *f = fopen(filename, "w");
|
|
if (!f)
|
|
return 0;
|
|
|
|
fwrite(&index->bwt_sz, sizeof(index->bwt_sz), 1, f);
|
|
fwrite(index->bwt, sizeof(char), index->bwt_sz, f);
|
|
fwrite(&index->alphabet_sz, sizeof(index->alphabet_sz), 1, f);
|
|
fwrite(index->alphabet, sizeof(char), index->alphabet_sz, f);
|
|
fwrite(index->ranges, sizeof(ranges_t), 2 * index->alphabet_sz, f);
|
|
fwrite(index->ranks, sizeof(ranks_t), index->bwt_sz * index->alphabet_sz, f);
|
|
fwrite(index->sa, sizeof(sa_t), index->bwt_sz, f);
|
|
|
|
fclose(f);
|
|
return 1;
|
|
}
|
|
|
|
fm_index *FMIndexReadFromFile(char *filename, int aligned) {
|
|
FILE *f = fopen(filename, "r");
|
|
if (!f)
|
|
return NULL;
|
|
|
|
fm_index *index;
|
|
|
|
if (!(index = calloc(1, sizeof(fm_index))))
|
|
goto error;
|
|
|
|
fread(&index->bwt_sz, sizeof(index->bwt_sz), 1, f);
|
|
|
|
if (!MaybeMallocAligned((void **)&index->bwt,
|
|
(index->bwt_sz + 1) * sizeof(char), aligned))
|
|
goto error;
|
|
fread(index->bwt, sizeof(char), index->bwt_sz, f);
|
|
index->bwt[index->bwt_sz] = '\0';
|
|
|
|
fread(&index->alphabet_sz, sizeof(index->alphabet_sz), 1, f);
|
|
|
|
if (!MaybeMallocAligned((void **)&index->alphabet,
|
|
(index->alphabet_sz + 1) * sizeof(char), aligned))
|
|
goto error;
|
|
fread(index->alphabet, sizeof(char), index->alphabet_sz, f);
|
|
index->alphabet[index->alphabet_sz] = '\0';
|
|
|
|
if (!MaybeMallocAligned((void **)&index->ranges,
|
|
2 * index->alphabet_sz * sizeof(ranges_t), aligned))
|
|
goto error;
|
|
fread(index->ranges, sizeof(ranges_t), 2 * index->alphabet_sz, f);
|
|
|
|
if (!MaybeMallocAligned((void **)&index->ranks,
|
|
index->bwt_sz * index->alphabet_sz * sizeof(ranks_t),
|
|
aligned))
|
|
goto error;
|
|
fread(index->ranks, sizeof(ranks_t), index->bwt_sz * index->alphabet_sz, f);
|
|
|
|
if (!MaybeMallocAligned((void **)&index->sa, index->bwt_sz * sizeof(sa_t),
|
|
aligned))
|
|
goto error;
|
|
fread(index->sa, sizeof(sa_t), index->bwt_sz, f);
|
|
|
|
fclose(f);
|
|
return index;
|
|
|
|
error:
|
|
if (index) {
|
|
if (index->bwt)
|
|
free(index->bwt);
|
|
if (index->alphabet)
|
|
free(index->alphabet);
|
|
if (index->ranges)
|
|
free(index->ranges);
|
|
if (index->ranks)
|
|
free(index->ranks);
|
|
if (index->sa)
|
|
free(index->sa);
|
|
free(index);
|
|
}
|
|
|
|
fclose(f);
|
|
return NULL;
|
|
}
|