# Find overlapping regions in text # Author: Michal Guerquin # Date: June 2005 import sre # return list of (start,end) tuples for all places in string that contain # a term in terms def find_matches (string, terms): matches = [] for t in terms: #print "Looking for term >%s<" % t substring = string substring_index = 0 done = False while not done and len(substring)>0: matched = sre.search(t, substring) if matched: startpos, endpos = matched.span() #print " Found term in >%s< at location %s-%s" % (substring, startpos, endpos) matches.append((substring_index + startpos, substring_index + endpos)) substring = substring[endpos:] substring_index += endpos else: #print " Could not find term in >%s<, finishing" % substring done = True matches.sort() return matches # given a list of tuples that define (start,end) indeces, return # a list of (sometimes new) tuples that cover all (start,end) segments def overlap (segments): segments.sort() new_segments = [] for startpos, endpos in segments: #print "Considering segment (%s,%s)" % (startpos, endpos) # find segment numbers where this segment overlaps with the new segments overlap_start_segment = -1 overlap_end_segment = -1 # for each new segment for i in range(len(new_segments)): seg_startpos = new_segments[i][0] seg_endpos = new_segments[i][1] #print " Inspecting new segment (%s,%s)" % (seg_startpos, seg_endpos) if seg_startpos-1 <= startpos <= seg_endpos: # the segment begins in this new_segment overlap_start_segment = i if seg_startpos-1 <= endpos <= seg_endpos: # the segment ends in this new_segment overlap_end_segment = i # special case 1 if overlap_start_segment == -1 and overlap_end_segment == -1: # brand new segment new_segment = (startpos, endpos) #print "Adding brand new segment (%s,%s)" % (startpos, endpos) new_segments.insert(0, new_segment) # special case 2 elif overlap_start_segment == -1 and overlap_end_segment != -1: # partially overlaps one or more segments from the left new_segment = (startpos, new_segments[overlap_end_segment][1]) #print "Adding new segment (%s,%s) to replace segments %s thru %s" % (startpos, new_segments[overlap_end_segment][1], 0, overlap_end_segment) for i in range(0,overlap_end_segment): new_segments.pop(0) new_segments.insert(0, new_segment) # special case 3 elif overlap_start_segment != -1 and overlap_end_segment == -1: # partially overlaps one or more segments from the right new_segment = (new_segments[overlap_end_segment][0], endpos) #print "Adding new segment (%s,%s) to replace segments %s thru %s" % (startpos, new_segments[overlap_end_segment][1], overlap_start_segment, len(new_segments)) for i in range(overlap_start_segment, len(new_segments)): new_segments.pop(overlap_start_segment) new_segments.insert(0, new_segment) else: # measure how far apart the two overlapping segments are segment_distance = overlap_end_segment - overlap_start_segment if segment_distance == 0: #start and end in the same new segment #print "No need to add this segment" pass # do nothing, no need to add this else: #print "Adding this segment to replace segments %s thru %s" % (overlap_start_segment, overlap_end_segment) new_segment = (new_segments[overlap_start_segment][0], new_segments[overlap_end_segment][1]) for i in range(segment_distance): new_segments.pop(overlap_start_segment) new_segments.insert(0, new_segment) new_segments.sort() #print "New segments are:" #for i in range(len(new_segments)): # print i, new_segments[i] return new_segments text = "This sentence contains many repeated substrings" terms = ["sent", "sentence", "many", "substr", "string"] matches = find_matches(text, terms) print "Text: ", text print "Terms:", " ".join(["\""+t+"\"" for t in terms]) print "Matches:" for m in matches: print "(%d, %d) \"%s\"" % (m[0], m[1], text[m[0]:m[1]]) print matches = overlap(matches) print "Optimized matches:" for m in matches: print "(%d, %d) \"%s\"" % (m[0], m[1], text[m[0]:m[1]]) print