#! /usr/bin/python import sys import psycopg2 import datetime import time # Returns similarity between station0 and station1 in range <0.0; 1.0> def Similarity(a, b): result = 0.0 station0, strength0 = a station1, strength1 = b size0 = len(station0) size1 = len(station1) if strength0[0] > strength1[0]: same = 0.0 for i in station1: if i in station0: same += 1 result = same/len(station1) elif strength0[0] < strength1[0]: same = 0.0 for i in station0: if i in station1: same += 1 result = same/len(station0) else: result = float(len(set(station0)&set(station1)))/float(len(set(station0)|set(station1))) return result def Histogram(c, time, sequence, sa, sz): hash = {} c.execute("""SELECT DISTINCT ON (time, sequence) time, sequence, (extract('epoch' FROM (time-%s::TIMESTAMP WITH TIME ZONE)))::float/(sequence-%s::BIGINT)::float FROM sputnik.ccc23 WHERE id IS NULL AND sequence BETWEEN %s::BIGINT AND %s::BIGINT AND time > %s::TIMESTAMP WITH TIME ZONE AND sequence > %s::BIGINT""", (time, sequence, sa, sz, time, sequence)) i = c.fetchone() while i != None: k = int(i[2]*1000) # Only reasonable values # According to source code analysis, sleep time is between 2 and 4 seconds # and we need some error margin if 1000 <= k and k <= 5000: hash[k] = hash.get(k, 0)+1 i = c.fetchone() if len(hash) > 0: m = max(hash.values()) for i in xrange(1000, 5001): # Let's take the smallest max if m == hash.get(i, 0): result = float(i)/1000.0 break return result, m else: return 0.0, 0 def Fetch(c, time, sequence, slope, sa, sz): result = [[time, sequence, slope, 0.0]] c.execute("""SELECT sputnik.array_accum(station), sputnik.array_accum(strength) FROM sputnik.ccc23 WHERE id IS NULL AND time = %s::TIMESTAMP WITH TIME ZONE AND sequence = %s::BIGINT""", (time, sequence)) i = c.fetchone() if i != None: result[0].append(i[0]) result[0].append(i[1]) i = c.fetchall() # Union of first 100s and the rest c.execute("""SELECT time, sequence, (extract('epoch' FROM (time-%s::TIMESTAMP WITH TIME ZONE)))::float/(sequence-%s::BIGINT)::float, 0.0, sputnik.array_accum(station), sputnik.array_accum(strength) FROM sputnik.ccc23 WHERE id IS NULL AND sequence > %s::BIGINT AND sequence <= %s::BIGINT+100::BIGINT AND time > %s::TIMESTAMP WITH TIME ZONE AND time <= %s::TIMESTAMP WITH TIME ZONE+'100 second'::INTERVAL GROUP BY time, sequence UNION SELECT time, sequence, (extract('epoch' FROM (time-%s::TIMESTAMP WITH TIME ZONE)))::float/(sequence-%s::BIGINT)::float, 0.0, sputnik.array_accum(station), sputnik.array_accum(strength) FROM sputnik.ccc23 WHERE id IS NULL AND sequence BETWEEN %s::BIGINT AND %s::BIGINT AND time > %s::TIMESTAMP WITH TIME ZONE AND sequence > %s::BIGINT AND (extract('epoch' FROM (time-%s::TIMESTAMP WITH TIME ZONE)))::float/(sequence-%s::BIGINT)::float BETWEEN %s::float-sputnik.BorderWidth(sequence-%s) AND %s::float+sputnik.BorderWidth(sequence-%s) GROUP BY time, sequence ORDER BY time""", (time, sequence, sequence, sequence, time, time, time, sequence, sa, sz, time, sequence, time, sequence, slope, sequence, slope, sequence)) i = c.fetchone() while i != None: result.append([i[0], i[1], i[2], i[2]-result[-1][2], i[4], i[5]]) i = c.fetchone() return result def Lines(data): result = [] candidate = [] for i in data: num = 0 for j in candidate: if i[0] > j[-1][0] and i[1] > j[-1][1]: num += 1 if len(candidate) == num: if len(candidate) == 1: result.extend(candidate[0]) elif len(candidate) > 1: result.append(candidate) candidate = [[i]] else: for j in candidate: if i[0] > j[-1][0] and i[1] > j[-1][1]: j.append(i) if 0 == num: candidate.append([i]) # Add last alternative if len(candidate) == 1: result.extend(candidate[0]) elif len(candidate) > 1: result.append(candidate) return result def Line(lines): result = [] for i in xrange(len(lines)): if type(lines[i][0]) != type([]): result.append(lines[i]) else: alternatives = [] if len(result) > 0: for j in lines[i]: if j[0][0] > result[-1][0] and j[0][1] > result[-1][1]: alternatives.append(j) else: alternatives = lines[i] scores = [0] * len(alternatives) sizes = map(lambda x: len(x), alternatives) best = max(sizes) for j in xrange(len(alternatives)): if sizes[j] == best: scores[j] += 1 stationsa = map(lambda x: Similarity((result[-1][4], result[-1][5]), (x[0][4], x[0][5])), alternatives) best = max(stationsa) for j in xrange(len(alternatives)): if stationsa[j] == best: scores[j] += 1 if i+1 < len(lines) and type(lines[i+1][0]) != type([]): stationsz = map(lambda x: Similarity((x[-1][4], x[-1][5]), (lines[i+1][4], lines[i+1][5])), alternatives) best = max(stationsz) for j in xrange(len(alternatives)): if stationsz[j] == best: scores[j] += 1 slopesa = map(lambda x: abs(alternatives[x][0][3]-result[-1][3]), xrange(len(alternatives))) best = min(slopesa) for j in xrange(len(alternatives)): if slopesa[j] == best: scores[j] += 1 if i+1 < len(lines) and type(lines[i+1][0]) != type([]): slopesz = map(lambda x: abs(alternatives[x][0][3]-lines[i+1][3]), xrange(len(alternatives))) best = max(slopesz) for j in xrange(len(alternatives)): if slopesz[j] == best: scores[j] += 1 best = max(scores) for j in xrange(len(alternatives)): # I chose the first one with highers score # Maybe take adjacency table into consideration? # But how to do this for more than one station? if scores[j] == best: result.extend(alternatives[j]) break # Count slope deltas once more, for final line proposal slope = result[0][2] for i in result: i[3] = i[2]-slope slope = i[2] return result def Break(a, b, c, d, slope): result = 0.0 SlopeDiff = 10.0 SlopeTrigger = 0.01 CounterDiff = 100 TimeDiff = datetime.timedelta(0, 120) StationSimilarity = 0.5 if abs(c[3]) > SlopeTrigger: if abs(c[3]) > abs(b[3])*SlopeDiff: result += 1.0 if abs(c[3]) > abs(d[3])*SlopeDiff: result += 1.0 # Time is more intuitive that sequence counter # Also I do not have to think about line coefficient # if c[1] - b[1] > CounterDiff: # result += 1.0 if c[0] - b[0] > TimeDiff: result += 1.0 if Similarity((b[4], b[5]), (c[4], c[5])) < StationSimilarity: result += 1.0 SlopeAB = float((b[0]-a[0]).seconds)/(b[1]-a[1]) SlopeBC = float((c[0]-b[0]).seconds)/(c[1]-b[1]) SlopeCD = float((d[0]-c[0]).seconds)/(d[1]-c[1]) # Slopes should be similar to each other and to the main slope if slope-1.0 <= SlopeAB and SlopeAB <= slope+1.0 and (SlopeBC < slope-1.0 or slope+1.0 < SlopeBC): result += 1.0 if slope-1.0 <= SlopeCD and SlopeCD <= slope+1.0 and (SlopeBC < slope-1.0 or slope+1.0 < SlopeBC): result += 1.0 return result/6.0 def FindIDs(connection, sa, sz, id): print sa, sz, id c = connection.cursor() c.execute("""SELECT DISTINCT sequence FROM sputnik.ccc23 WHERE id IS NULL AND sequence BETWEEN %s AND %s ORDER BY sequence""", (sa, sz)) start = c.fetchall() connection.commit() c.close() c = None for s in start: s0 = s[0] c = connection.cursor() c.execute("""SELECT DISTINCT time FROM sputnik.ccc23 WHERE id IS NULL AND sequence = %s""", (s0,)) time = c.fetchall() connection.commit() c.close() c = None for t in time: t0 = t[0] c = connection.cursor() slope, count = Histogram(c, t0, s0, sa, sz) if slope > 0.0 and count >= 8: data = Fetch(c, t0, s0, slope, sa, sz) lines = Lines(data) line = Line(lines) print id, t0, s0, slope, count, len(line) for i in xrange(len(line)): skip = False if len(line[i][4]) != len(line[i][5]): print "Error in size of ", line[i] skip = True s = line[i][5][0] for j in line[i][5]: if j != s: print "Error in strength of ", line[i] skip = True if skip: break c.execute("""UPDATE sputnik.ccc23 SET id = %s WHERE id IS NULL AND time = %s::TIMESTAMP WITH TIME ZONE AND sequence = %s::BIGINT""", (id, line[i][0], line[i][1])) if i > 0 and i < len(line)-2: b = Break(line[i-1], line[i], line[i+1], line[i+2], slope) if b > 0.5: print line[i][0], line[i][1], line[i][2], line[i][3], line[i][4], line[i][5] id += 1 print "Break here, new id ", id, b print line[i+1][0], line[i+1][1], line[i+1][2], line[i+1][3], line[i+1][4], line[i+1][5] id += 1 connection.commit() c.close() c = None return id connection = psycopg2.connect("dbname=tomus") id = 1000000 cursor = connection.cursor() cursor.execute("SELECT MAX(id) FROM sputnik.ccc23 WHERE id IS NOT NULL") data = cursor.fetchall() cursor.close() cursor = None if len(data) > 0 and data[0][0] != None: print "New ID ", id = data[0][0]+1 print id id = FindIDs(connection, 0, 2*65536, id) # Very large data set, 2924448 rows id = FindIDs(connection, 2*65536, 3*65536, id) # Very large data set, 2076875 rows id = FindIDs(connection, 3*65536, 4*65536, id) # Very large data set, 1277488 rows id = FindIDs(connection, 4*65536, 5*65536, id) # Very large data set, 1016195 rows id = FindIDs(connection, 5*65536, 6*65536, id) # Very large data set, 620763 rows id = FindIDs(connection, 6*65536, 7*65536, id) for i in range(7, 61): id = FindIDs(connection, i*65536, (i+1)*65536, id) for i in range(60, 320, 10): id = FindIDs(connection, i*65536, (i+10)*65536, id) id = FindIDs(connection, 310*65536, 65535*65536L, id) print id connection.commit() connection.close() connection = None