| Bytes | Lang | Time | Link |
|---|---|---|---|
| nan | 250131T130356Z | 138 Aspe | |
| 001 | C++ time On^2 | 150309T041452Z | Ray |
| nan | 150304T182825Z | MickyT | |
| nan | 150302T122225Z | ror3d |
Rust
Rust port of @Ray's C++ answer.
use std::cmp::{max, min};
use std::io::{self, BufRead};
fn main() {
let stdin = io::stdin();
let mut lines = stdin.lock().lines();
// Read the two input lines
let s1 = lines.next().unwrap().unwrap();
let s2 = lines.next().unwrap().unwrap();
// Convert the strings to char arrays for easier indexing
let chars1: Vec<char> = s1.chars().collect();
let chars2: Vec<char> = s2.chars().collect();
let n1 = chars1.len();
let n2 = chars2.len();
let mut max_len = 0;
let mut max_end = -1;
// Loop over the range [1-n2, n1)
for i in (1 - n2 as isize)..(n1 as isize) {
let mut f0 = 0;
let mut f1 = 0;
let mut max_len2 = 0;
let mut max_end2 = -1;
// Determine the inclusive range for j based on i
let start_j = max(i, 0) as usize; // ensures j won’t be negative
let end_j = min(n1 as isize, i + n2 as isize) as usize; // ensures j won’t exceed n1
for j in start_j..end_j {
// Because i might be negative, and j is always ≥ 0, the subindex for s2 is (j - i)
let index_in_s2 = (j as isize - i) as usize;
if chars1[j] == chars2[index_in_s2] {
f0 += 1;
f1 += 1;
} else {
f1 = f0 + 1;
f0 = 0;
}
if f1 > max_len2 {
max_len2 = f1;
max_end2 = j as isize + 1; // +1 so it's the position after the matched section
}
}
if max_len2 > max_len {
max_len = max_len2;
max_end = max_end2;
}
}
// Equivalent to assert(max_end != -1);
assert!(max_end != -1);
// Print the same output as the original code: "start_index end_index"
// Note that (max_end - max_len) + 1 is the starting index in 1-based indexing.
let start_pos = max_end - max_len as isize + 1;
println!("{} {}", start_pos, max_end);
}
C++ time: O(n^2), extra space: O(1)
It takes 0.2s to complete the 15K data on my machine.
To compile it, use:
g++ -std=c++11 -O3 code.cpp -o code
To run it, use:
./code < INPUT_FILE_THAT_CONTAINS_TWO_LINES_SPERATED_BY_A_LINE_BREAK
Explaination
The idea is simple, for string s1 and s2, we try to offset s2 by i:
s1: abcabcabc
s2: bcabcab
When offset is 3:
s1: abcabcabc
s2: bcabcab
Then, for each offset i, we perform a dynamic programing scan on s1[i:] and s2. For each j, let f[j, 0] be the maximum length d such that s1[j - d:j] == s2[j - i - d: j - i]. Similarly, let f[j, 1] be the maximum length d such that strings s1[j - d:j] and s2[j - i - d:j - i] differ by at most 1 character.
So for s1[j] == s2[j - i], we have:
f[j, 0] = f[j - 1, 0] + 1 // concat solution in f[j - 1, 0] and s1[j]
f[j, 1] = f[j - 1, 1] + 1 // concat solution in f[j - 1, 1] and s1[j]
Otherwise:
f[j, 0] = 0 // the only choice is empty string
f[j, 1] = f[j - 1, 0] + 1 // concat solution in f[j - 1, 0] and s1[j] (or s2[j - i])
And:
f[-1, 0] = f[-1, 1] = 0
Since we only need f[j - 1, :] to calculate f[j, :], only O(1) extra space is used.
Finally, the max length will be:
max(f[j, 1] for all valid j and all i)
Code
#include <string>
#include <cassert>
#include <iostream>
using namespace std;
int main() {
string s1, s2;
getline(cin, s1);
getline(cin, s2);
int n1, n2;
n1 = s1.size();
n2 = s2.size();
int max_len = 0;
int max_end = -1;
for(int i = 1 - n2; i < n1; i++) {
int f0, f1;
int max_len2 = 0;
int max_end2 = -1;
f0 = f1 = 0;
for(int j = max(i, 0), j_end = min(n1, i + n2); j < j_end; j++) {
if(s1[j] == s2[j - i]) {
f0 += 1;
f1 += 1;
} else {
f1 = f0 + 1;
f0 = 0;
}
if(f1 > max_len2) {
max_len2 = f1;
max_end2 = j + 1;
}
}
if(max_len2 > max_len) {
max_len = max_len2;
max_end = max_end2;
}
}
assert(max_end != -1);
// cout << max_len << endl;
cout << max_end - max_len + 1 << " " << max_end << endl;
}
R
Seems I was over complicating the problem with the previous solution. This is about 50% quicker (23 secs on 15k strings) than the previous one and pretty simple.
rm(list=ls(all=TRUE))
a="xxxappleyyyyyyy"
b="zapllezzz"
s=proc.time()
matchLen=1
matchIndex=1
indexA = 1
repeat {
i = 0
repeat {
srch = substring(a,indexA,indexA+matchLen+i)
if (agrepl(srch,b,max.distance=list(insertions=0,deletions=0,substitutions=1)))
i = i + 1
else {
if (i > 0) {
matchLen = matchLen + i - 1
matchIndex = indexA
}
break
}
}
indexA=indexA+1
if (indexA + matchLen > nchar(a)) break
}
c(matchIndex, matchLen + matchIndex)
print (substring(a,matchIndex, matchLen + matchIndex))
print(proc.time()-s)
This will never be a contender due to the language, but I did have a bit of fun doing it.
Not sure of the complexity of it, but over a couple of ~15k strings it takes 43 secs using a single thread. The largest portion of that was the sorting of the arrays. I tried some other libraries, but without significant improvement.
a="xxxappleyyyyyyy"
b="zapllezzz"
s=proc.time()
N=nchar
S=substring
U=unlist
V=strsplit
A=N(a)
B=N(b)
a=S(a,1:A)
b=S(b,1:B)
a=sort(a,method="quick")
b=sort(b,method="quick")
print(proc.time()-s)
C=D=1
E=X=Y=I=0
repeat{
if(N(a[C])>E && N(b[D])>E){
for(i in E:min(N(a[C]),N(b[D]))){
if (sum(U(V(S(a[C],1,i),''))==U(V(S(b[D],1,i),'')))>i-2){
F=i
} else break
}
if (F>E) {
X=A-N(a[C])+1
Y=X+F-1
E=F
}
if (a[C]<b[D])
C=C+1
else
D=D+1
} else
if(S(a[C],1,1)<S(b[D],1,1))C=C+1 else D=D+1
if(C>A||D>B)break
}
c(X,Y)
print(proc.time()-s)
Method:
- Create a suffix array for each string
- Order the suffix arrays
- Step through each of the arrays in a staggered sort of way comparing the beginning of each
C++
I tried thinking of a good algorithm to do this, but I'm a bit distracted today and couldn't think of anything that would work well. This runs at O(n^3) time, so it's veeeery slow. The other option I thought of could have been theoretically faster but would have taken O(n^2) space, and with inputs of 1M, it would have been worse even.
It's shameful, it takes 190 seconds for inputs of 15K. I'll try to improve it. Edit: Added multiprocessing. Now takes 37 seconds for 15K input on 8 threads.
#include <string>
#include <vector>
#include <sstream>
#include <chrono>
#include <thread>
#include <atomic>
#undef cin
#undef cout
#include <iostream>
using namespace std;
typedef pair<int, int> range;
int main(int argc, char ** argv)
{
string a = "xxxappleyyyyyyy";
string b = "zapllezzz";
getline(cin, a);
getline(cin, b);
range longestA;
range longestB;
using namespace std::chrono;
high_resolution_clock::time_point t1 = high_resolution_clock::now();
unsigned cores = thread::hardware_concurrency(); cores = cores > 0 ? cores : 1;
cout << "Processing on " << cores << " cores." << endl;
atomic<int> processedCount(0);
vector<thread> threads;
range* longestAs = new range[cores];
range* longestBs = new range[cores];
for (int t = 0; t < cores; ++t)
{
threads.push_back(thread([&processedCount, cores, t, &a, &b, &longestBs, &longestAs]()
{
int la = a.length();
int l = la / cores + (t==cores-1? la % cores : 0);
int lb = b.length();
int aS = t*(la/cores);
for (int i = aS; i < aS + l; ++i)
{
int count = processedCount.fetch_add(1);
if ((count+1) * 100 / la > count * 100 / la)
{
cout << (count+1) * 100 / la << "%" << endl;
}
for (int j = 0; j < lb; ++j)
{
range currentB = make_pair(j, j);
bool letterChanged = false;
for (int k = 0; k + j < lb && k + i < la; ++k)
{
if (a[i + k] == b[j + k])
{
currentB = make_pair(j, j + k);
}
else if (!letterChanged)
{
letterChanged = true;
currentB = make_pair(j, j + k);
}
else
{
break;
}
}
if (currentB.second - currentB.first > longestBs[t].second - longestBs[t].first)
{
longestBs[t] = currentB;
longestAs[t] = make_pair(i, i + currentB.second - currentB.first);
}
}
}
}));
}
longestA = make_pair(0,0);
for(int t = 0; t < cores; ++t)
{
threads[t].join();
if (longestAs[t].second - longestAs[t].first > longestA.second - longestA.first)
{
longestA = longestAs[t];
longestB = longestBs[t];
}
}
high_resolution_clock::time_point t2 = high_resolution_clock::now();
duration<double> time_span = duration_cast<duration<double>>(t2 - t1);
cout << "First substring at range (" << longestA.first << ", " << longestA.second << "):" << endl;
cout << a.substr(longestA.first, longestA.second - longestA.first + 1) << endl;
cout << "Second substring at range (" << longestB.first << ", " << longestB.second << "):" << endl;
cout << b.substr(longestB.first, longestB.second - longestB.first + 1) << endl;
cout << "It took me " << time_span.count() << " seconds for input lengths " << a.length() << " and " << b.length() <<"." << endl;
char c;
cin >> c;
return 0;
}