Fuzzy Search(FFT)

描述

Leonid works for a small and promising start-up that works on decoding the human genome. His duties include solving complex problems of finding certain patterns in long strings consisting of letters A, T, G and C.

Let’s consider the following scenario. There is a fragment of a human DNA chain, recorded as a string $S$. To analyze the fragment, you need to find all occurrences of string $T$ in a string $S$. However, the matter is complicated by the fact that the original chain fragment could contain minor mutations, which, however, complicate the task of finding a fragment. Leonid proposed the following approach to solve this problem.

Let’s write down integer $k ≥ 0$ — the error threshold. We will say that string $T$ occurs in string S on position $i$ $(1 ≤ i ≤ |S| - |T| + 1)$, if after putting string $T$ along with this position, each character of string $T$ corresponds to the some character of the same value in string $S$ at the distance of at most $k$. More formally, for any $j$ $(1 ≤ j ≤ |T|)$ there must exist such $p$ $(1 ≤ p ≤ |S|)$, that $|(i + j - 1) - p| ≤ k$ and $S[p] = T[j]$.

For example, corresponding to the given definition, string ACAT occurs in string AGCAATTCAT in positions $2$, $3$ and $6$.

Note that at $k = 0$ the given definition transforms to a simple definition of the occurrence of a string in a string.

Help Leonid by calculating in how many positions the given string $T$ occurs in the given string $S$ with the given error threshold.

Input

The first line contains three integers $|S|$, $|T|$, $k$ $(1 ≤ |T| ≤ |S| ≤ 200 000, 0 ≤ k ≤ 200 000)$ — the lengths of strings $S$ and $T$ and the error threshold.

The second line contains string $S$.

The third line contains string $T$.

Both strings consist only of uppercase letters A, T, G and C.

Output

Print a single number — the number of occurrences of $T$ in $S$ with the error threshold $k$ by the given definition.

Examples

  • input
1
2
3
10 4 1
AGCAATTCAT
ACAT
  • output
1
3

Note

If you happen to know about the structure of the human genome a little more than the author of the problem, and you are not impressed with Leonid’s original approach, do not take everything described above seriously.

思路

  • 首先可以扫一遍母串,在线性时间内用尺取的方法预处理出母串每一个位置可以匹配的字符集。
  • 然后就是模式串和母串的匹配。将四种字符分开计算贡献。对于母串每个能够匹配的位置,我们将它赋为$1$,不能匹配则赋为$0$,存在$s$中;对于模式串,如果它这一位等于当前计算的这个字符,就将这一位赋为$1$,否则赋为$0$,存在$t$中。将模式串倒置后模式串位于某一位置与主串的匹配字符数量就是$s$与$t$的卷积。如果模式串的四种字符在某一位置匹配的字符数量等于模式串的长度,就说明这个位置可以匹配。
  • 卷积只需要用FFT来计算即可。

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#include <bits/stdc++.h>
using namespace std;
#define clr(a, x) memset(a, x, sizeof(a))
#define mp(x, y) make_pair(x, y)
#define pb(x) push_back(x)
#define X first
#define Y second
#define fastin \
ios_base::sync_with_stdio(0); \
cin.tie(0);
typedef long long ll;
typedef long double ld;
typedef pair<int, int> PII;
typedef vector<int> VI;
const int INF = 0x3f3f3f3f;
const int mod = 1e9 + 7;
const double eps = 1e-6;
const double PI = acos(-1.0);

const int maxn = 1 << 20;

struct Complex
{
double x, y;
Complex(double tx = 0.0, double ty = 0.0)
{
x = tx;
y = ty;
}
Complex operator-(const Complex& b) const
{
return Complex(x - b.x, y - b.y);
}
Complex operator+(const Complex& b) const
{
return Complex(x + b.x, y + b.y);
}
Complex operator*(const Complex& b) const
{
return Complex(x * b.x - y * b.y, x * b.y + y * b.x);
}
};


void change(Complex y[], int len)
{
for (int i = 1, j = len / 2; i < len - 1; i++)
{
if (i < j)
swap(y[i], y[j]);
int k = len / 2;
while (j >= k)
{
j -= k;
k /= 2;
}
if (j < k)
j += k;
}
}

void fft(Complex y[], int len, int on)
{
change(y, len);
for (int h = 2; h <= len; h <<= 1)
{
double x = cos(-on * 2 * PI / h), y = sin(-on * 2 * PI / h);
Complex wn(x, y);
for (int j = 0; j < len; j += h)
{
Complex w(1, 0);
for (int k = j; k < j + h / 2; k++)
{
Complex u = y[k];
Complex t = w * y[k + h / 2];
y[k] = u + t;
y[k + h / 2] = u - t;
w = w * wn;
}
}
}
if (on == -1)
for (int i = 0; i < len; i++)
y[i].x /= len;
}

char S[maxn], T[maxn];
Complex s[maxn], t[maxn];
int bitmask[maxn];
int sum[maxn];
int cnt[4];
map<char, int> M;

int main()
{
#ifndef ONLINE_JUDGE
freopen("1.in", "r", stdin);
freopen("1.out", "w", stdout);
#endif
M['A'] = 0, M['G'] = 1, M['C'] = 2, M['T'] = 3;
int n, m, k;
while (~scanf("%d%d%d", &n, &m, &k))
{
scanf("%s%s", S, T);
reverse(T, T + m);
clr(cnt, 0);
clr(sum, 0);
clr(bitmask, 0);
int L = 0, R = min(n, k) - 1;
for (int i = L; i <= R; i++) cnt[M[S[i]]]++;
for (int i = 0; i < n; i++)
{
if (R + 1 < n) cnt[M[S[++R]]]++;
if (i - L > k) cnt[M[S[L++]]]--;
for (int j = 0; j < 4; j++)
if (cnt[j]) bitmask[i] |= (1 << j);
}
int N = 1;
while (N < (n + m)) N <<= 1;
for (int i = 0; i < 4; i++)
{
for (int j = 0; j < n; j++)
s[j] = ((bitmask[j] >> i) & 1) ? Complex(1, 0) : Complex(0, 0);
for (int j = n; j < N; j++)
s[j] = Complex(0, 0);

for (int j = 0; j < m; j++)
t[j] = M[T[j]] == i ? Complex(1, 0) : Complex(0, 0);
for (int j = m; j < N; j++)
t[j] = Complex(0, 0);

fft(s, N, 1);
fft(t, N, 1);
for (int j = 0; j < N; j++)
s[j] = s[j] * t[j];
fft(s, N, -1);
for (int j = m - 1; j < n; j++)
sum[j] += (int)(s[j].x + 0.5);
}
int ans = 0;
for (int i = m - 1; i < n; i++)
if (sum[i] == m) ans++;
printf("%d\n", ans);
}
return 0;
}
捐助作者