Return a list of deletions and insertions that will turn 'a' into 'b'. This is done by traversing an implicit edit graph and searching for the shortest route. The basic idea is as follows: - Matching a character is free as long as there was no deletion/insertion befor
(a, b, sline=0)
| 50 | |
| 51 | |
| 52 | def diff(a, b, sline=0): |
| 53 | """ |
| 54 | Return a list of deletions and insertions that will turn 'a' into 'b'. This |
| 55 | is done by traversing an implicit edit graph and searching for the shortest |
| 56 | route. The basic idea is as follows: |
| 57 | |
| 58 | - Matching a character is free as long as there was no |
| 59 | deletion/insertion before. Then, matching will be seen as delete + |
| 60 | insert [1]. |
| 61 | - Deleting one character has the same cost everywhere. Each additional |
| 62 | character costs only have of the first deletion. |
| 63 | - Insertion is cheaper the earlier it happens. The first character is |
| 64 | more expensive that any later [2]. |
| 65 | |
| 66 | [1] This is that world -> aolsa will be "D" world + "I" aolsa instead of |
| 67 | "D" w , "D" rld, "I" a, "I" lsa |
| 68 | [2] This is that "hello\n\n" -> "hello\n\n\n" will insert a newline after |
| 69 | hello and not after \n |
| 70 | """ |
| 71 | d = defaultdict(list) |
| 72 | seen = defaultdict(lambda: sys.maxsize) |
| 73 | |
| 74 | d[0] = [(0, 0, sline, 0, ())] |
| 75 | cost = 0 |
| 76 | deletion_cost = len(a) + len(b) |
| 77 | insertion_cost = len(a) + len(b) |
| 78 | while True: |
| 79 | while len(d[cost]): |
| 80 | x, y, line, col, what = d[cost].pop() |
| 81 | |
| 82 | if a[x:] == b[y:]: |
| 83 | return what |
| 84 | |
| 85 | if x < len(a) and y < len(b) and a[x] == b[y]: |
| 86 | ncol = col + 1 |
| 87 | nline = line |
| 88 | if a[x] == "\n": |
| 89 | ncol = 0 |
| 90 | nline += 1 |
| 91 | lcost = cost + 1 |
| 92 | if ( |
| 93 | what |
| 94 | and what[-1][0] == "D" |
| 95 | and what[-1][1] == line |
| 96 | and what[-1][2] == col |
| 97 | and a[x] != "\n" |
| 98 | ): |
| 99 | # Matching directly after a deletion should be as costly as |
| 100 | # DELETE + INSERT + a bit |
| 101 | lcost = (deletion_cost + insertion_cost) * 1.5 |
| 102 | if seen[x + 1, y + 1] > lcost: |
| 103 | d[lcost].append((x + 1, y + 1, nline, ncol, what)) |
| 104 | seen[x + 1, y + 1] = lcost |
| 105 | if y < len(b): # INSERT |
| 106 | ncol = col + 1 |
| 107 | nline = line |
| 108 | if b[y] == "\n": |
| 109 | ncol = 0 |