# Find overlapping regions in text
# Author: Michal Guerquin
# Date: June 2005

import sre

# return list of (start,end) tuples for all places in string that contain
# a term in terms
def find_matches (string, terms):
  matches = []
  for t in terms:
    #print "Looking for term >%s<" % t
    substring = string
    substring_index = 0
    done = False
    while not done and len(substring)>0:
      matched = sre.search(t, substring)
      if matched:
        startpos, endpos = matched.span()
        #print "  Found term in >%s< at location %s-%s" % (substring, startpos, endpos)
        matches.append((substring_index + startpos, substring_index + endpos))
        substring = substring[endpos:]
        substring_index += endpos
      else:
        #print "  Could not find term in >%s<, finishing" % substring
        done = True

  matches.sort()

  return matches

# given a list of tuples that define (start,end) indeces, return
# a list of (sometimes new) tuples that cover all (start,end) segments
def overlap (segments):
  segments.sort()
  new_segments = []

  for startpos, endpos in segments:
    #print "Considering segment (%s,%s)" % (startpos, endpos)

    # find segment numbers where this segment overlaps with the new segments
    overlap_start_segment = -1
    overlap_end_segment = -1

    # for each new segment
    for i in range(len(new_segments)):
      seg_startpos = new_segments[i][0]
      seg_endpos   = new_segments[i][1]
      #print "  Inspecting new segment (%s,%s)" % (seg_startpos, seg_endpos)
      if seg_startpos-1 <= startpos <= seg_endpos: # the segment begins in this new_segment
        overlap_start_segment = i
      if seg_startpos-1 <= endpos <= seg_endpos:   # the segment ends in this new_segment
        overlap_end_segment = i

    # special case 1
    if overlap_start_segment == -1 and overlap_end_segment == -1: # brand new segment
      new_segment = (startpos, endpos)
      #print "Adding brand new segment (%s,%s)" % (startpos, endpos)
      new_segments.insert(0, new_segment)

    # special case 2
    elif overlap_start_segment == -1 and overlap_end_segment != -1: # partially overlaps one or more segments from the left
      new_segment = (startpos, new_segments[overlap_end_segment][1])
      #print "Adding new segment (%s,%s) to replace segments %s thru %s" % (startpos, new_segments[overlap_end_segment][1], 0, overlap_end_segment)
      for i in range(0,overlap_end_segment):
        new_segments.pop(0)
      new_segments.insert(0, new_segment)

    # special case 3
    elif overlap_start_segment != -1 and overlap_end_segment == -1: # partially overlaps one or more segments from the right
      new_segment = (new_segments[overlap_end_segment][0], endpos)
      #print "Adding new segment (%s,%s) to replace segments %s thru %s" % (startpos, new_segments[overlap_end_segment][1], overlap_start_segment, len(new_segments))
      for i in range(overlap_start_segment, len(new_segments)):
        new_segments.pop(overlap_start_segment)
      new_segments.insert(0, new_segment)

    else:
      # measure how far apart the two overlapping segments are
      segment_distance = overlap_end_segment - overlap_start_segment

      if segment_distance == 0: #start and end in the same new segment
        #print "No need to add this segment"
        pass # do nothing, no need to add this
      else:
        #print "Adding this segment to replace segments %s thru %s" % (overlap_start_segment, overlap_end_segment)
        new_segment = (new_segments[overlap_start_segment][0], new_segments[overlap_end_segment][1])
  
        for i in range(segment_distance):
          new_segments.pop(overlap_start_segment) 
        new_segments.insert(0, new_segment)

    new_segments.sort()

    #print "New segments are:"
    #for i in range(len(new_segments)):
    #  print i, new_segments[i]

  return new_segments

text = "This sentence contains many repeated substrings"
terms = ["sent", "sentence", "many", "substr", "string"]

matches = find_matches(text, terms)

print "Text: ", text
print "Terms:", "  ".join(["\""+t+"\"" for t in terms])

print "Matches:"
for m in matches:
  print "(%d, %d) \"%s\"" % (m[0], m[1], text[m[0]:m[1]])

print 

matches = overlap(matches)

print "Optimized matches:"
for m in matches:
  print "(%d, %d) \"%s\"" % (m[0], m[1], text[m[0]:m[1]])

print 
