@@ -4,6 +4,8 @@ import os.path
 import traceback
 import json
 import socket
+import struct
+import pprint
 import time
 import datetime
 import json
@@ -11,6 +13,7 @@ import re
 import time
 import threading
 import binascii
+import pylzma
 from cffi import FFI
 import select
 from subprocess import Popen, PIPE
@@ -22,44 +25,178 @@ except ImportError:
 import requests
 
 def __pmap():
-    return [
-        ("com", "c")
-    ]
+    return {
+        "com": "c",
+        "thread": "t",
+        "no": "n",
+        "sub": "s",
+        "time": "T",
+        "name": "N",
+        "trip": "f",
+        "country": "C",
+        "id": "i",
+        "filename": "F",
+        "image": "I",
+        "realFilename": "l",
+        "fileSize": "S"
+    }
 
-def __encodepost(post):
-    pass #TODO: __pmap comp and lzma comp
+def log(stre):
+    print (stre)
+
+def encode_post(post):
+    mape = __pmap()
+    np = dict()
+    for k,v in post.items():
+        nk = k
+        if(k in mape):
+            nk = mape[k]
+        np[nk] = v
+    js = json.dumps(np)
+    data = pylzma.compress(js)
+    #log("Encoded post from %d to %d bytes (%.2f%% reduction)" % (len(js), len(data), 100.00-(len(data)/len(js))*100.00))
+    return data
+
+def decode_post(data):
+    js = pylzma.decompress(data)
+    np = json.loads(js)
+    mape = dict((v,k) for k,v in __pmap().items())
+    post = dict()
+    for k,v in np.items():
+        nk = k
+        if(k in mape):
+            nk = mape[k]
+        post[nk] = v
+    return post
 
 class StatBuffer:
     SLV_LOW = 0 #Keep everything
     SLV_NOTEXT = 1 #Remove subject & comment
-    SLV_NOUI = 2 #Remove all user inputed information (sub, com, filename)
+    SLV_NOPI = 2 #Remove all poster information (sub,com,name,trip)
+    SLV_NOUI = 3 #Remove all user inputed information (sub, com,name,trip, file info)
     SLV_HIGH = 0xFF #Keep only post number
 
-    def __init__():
+    def __init__(self):
         pass
-    def _encode(post, striplv):
+    def _decode(self, data):
+        if(isinstance(data, int)):
+            return data
+        else:
+            return decode_post(data)
+    def _encode(self, post, striplv):
         if(striplv == StatBuffer.SLV_LOW):
-            return __encodepost(post)
+            return encode_post(post)
         elif(striplv == StatBuffer.SLV_NOTEXT):
             if "com" in post:
                 del post["com"]
             if "sub" in post:
                 del post["sub"]
-            return __encodepost(post)
+            return encode_post(post)
+        elif(striplv == StatBuffer.SLV_NOPI):
+            if "com" in post:
+                del post["com"]
+            if "sub" in post:
+                del post["sub"]
+            if "name" in post:
+                del post["name"]
+            if "trip" in post:
+                del post["trip"]
+            return encode_post(post)
         elif(striplv == StatBuffer.SLV_NOUI):
             if "com" in post:
                 del post["com"]
             if "sub" in post:
                 del post["sub"]
-            #TODO: Remove image stuff
-            return __encodepost(post)
+            if "name" in post:
+                del post["name"]
+            if "trip" in post:
+                del post["trip"]
+            if "filename" in post:
+                del post["filename"]
+                del post["fileSize"]
+                del post["realFilename"]
+                del post["image"]
+            return encode_post(post)
         elif(striplv == StatBuffer.SLV_HIGH):
-            return post["no"]
+            return encode_post({"no": post["no"]})
         else: return None
-    def write():
+    def write(self, post):
         raise NotImplementedError("Abstract method not implemented")
-    def read():
+    def read(self):
         raise NotImplementedError("Abstract method not implemented")
+    def close(self): pass
+
+class MemoryBuffer(StatBuffer):
+    def __init__(self, level):
+        self.lvl = level
+        self.store = list()
+    def write(self, post):
+        data = super()._encode(post, self.lvl)
+        self.store.append(data)
+    def raw(self):
+        return json.dumps(self.store)
+    def clear(self):
+        self.store = list()
+    def length(self):
+        return len(self.store)
+    def read(self):
+        base = super()
+        return list(base._decode(d) for d in self.store)
+    def findMax(self):
+        mx=0
+        for ent in self.store:
+            post = super()._decode(ent)
+            if(post["no"]>mx): mx = post["no"]
+        return mx
+
+class FileBuffer(StatBuffer):
+    def __init__(self, fn, level):
+        self.lvl = level
+        self.file = open(fn, "a+b")
+    def write(self, post):
+        data = super()._encode(post, self.lvl)
+        self.file.write(struct.pack("I", len(data)))
+        self.file.write(data)
+    def read(self):
+        self.file.seek(0)
+        posts = list()
+        lentr=self.file.read(4)
+        while( lentr != None):
+            if(len(lentr)<4): break
+            tl = struct.unpack("I", lentr)[0]
+            #pprint.pprint(tl)
+            posts.append(super()._decode(self.file.read(tl)))
+            lentr=self.file.read(4)
+        self.file.seek(0, 2)
+        return posts
+    def length(self):
+        self.file.seek(0)
+        maxln = 0
+        lentr=self.file.read(4)
+        while( lentr != None):
+            if(len(lentr)<4): break
+            tl = struct.unpack("I", lentr)[0]
+            self.file.seek(tl, 1)
+            maxln+=1
+            lentr=self.file.read(4)#wonderful language lmao
+        self.file.seek(0, 2)
+        return maxln
+    def close(self):
+        self.file.close()
+    def clear(self):
+        self.file.truncate()
+    def findMax(self):
+        self.file.seek(0)
+        mx=0
+        lentr=self.file.read(4)
+        while( lentr != None):
+            if(len(lentr)<4): break
+            tl = struct.unpack("I", lentr)[0]
+            post = super()._decode(self.file.read(tl))
+            if(post["no"] > mx): mx = post["no"]
+            lentr=self.file.read(4)#wonderful language lmao
+        self.file.seek(0, 2)
+        return mx
 
 def parse_post(post):
     res = dict()
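
For reference, the round trip introduced above — shorten the JSON keys through the __pmap table, LZMA-compress the JSON, and frame each record behind a 4-byte struct length prefix as FileBuffer does — can be sketched in a few self-contained lines. The stdlib lzma module stands in for pylzma here, and the trimmed key map and helper names are illustrative only, not part of the patch:

import io
import json
import lzma
import struct

# Subset of the __pmap() table: long post keys -> short keys, and its inverse.
PMAP = {"no": "n", "com": "c", "sub": "s", "time": "T", "name": "N"}
RMAP = {v: k for k, v in PMAP.items()}

def encode(post):
    # Shorten keys, serialise to JSON, then compress (lzma standing in for pylzma).
    short = {PMAP.get(k, k): v for k, v in post.items()}
    return lzma.compress(json.dumps(short).encode("utf-8"))

def decode(blob):
    # Reverse: decompress, parse JSON, restore the long key names.
    short = json.loads(lzma.decompress(blob))
    return {RMAP.get(k, k): v for k, v in short.items()}

# Length-prefixed framing, as FileBuffer.write/read do with struct.pack("I", ...).
buf = io.BytesIO()
for post in ({"no": 1, "com": "first"}, {"no": 2, "com": "second"}):
    rec = encode(post)
    buf.write(struct.pack("I", len(rec)))
    buf.write(rec)

buf.seek(0)
out = []
while True:
    hdr = buf.read(4)
    if len(hdr) < 4:          # same stop condition FileBuffer.read uses
        break
    (size,) = struct.unpack("I", hdr)
    out.append(decode(buf.read(size)))
print(out)                    # [{'no': 1, 'com': 'first'}, {'no': 2, 'com': 'second'}]

Note that "I" with no byte-order prefix means native byte order and alignment, so a buffer file written on one machine is not guaranteed to read back on another; "<I" would pin the prefix to four little-endian bytes.
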
@@ -94,21 +231,35 @@ def parse_post(post):
 
     return res
 
+def parse_thread(api, board, post, last):
+    fullThread = requests.get((api % board)+"thread/"+str(post["no"])+".json").json()
+    posts = list()
+    for fPost in fullThread["posts"]:
+        if(fPost["no"] > last):
+            np = parse_post(fPost)
+            if(np!=None):
+                posts.append(np)
+    return posts
+#if we spider all pages, go from page 10 to 1
 def parse_page(api, board, page, last):
     dta = requests.get((api % board)+page+".json")
     posts = list()
     page = dta.json()
+    tpd=0
     for thread in page["threads"]:
         post = thread["posts"][0]
         if post["no"] <= last:
-            continue
+            #thread is not new
+            #are there any new posts?
+            for vp in thread["posts"]:
+                if(vp["no"] >last):
+                    posts.extend(parse_thread(api,board,post,last))
+                    tpd+=1
+                    break
         else:
-            fullThread = requests.get((api % board)+"thread/"+str(post["no"])+".json").json()
-            for fPost in fullThread["posts"]:
-                np = parse_post(fPost)
-                if(np!=None):
-                    posts.append(np)
-
+            posts.extend(parse_thread(api,board, post,last))
+            tpd+=1
+    log("\t(threads parsed this rotation: %d)"%tpd)
     return posts
 
 def pnomax(last, posts):
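
The reworked parse_page only pays for a full thread/<no>.json fetch when a thread on the page actually shows something newer than last: a brand-new OP, or an already-known thread with a visible reply whose post number exceeds last. A small sketch of that decision in isolation (the helper name and sample data are hypothetical, not from the patch):

def threads_to_refetch(page_json, last):
    # OP numbers of threads worth a full thread/<no>.json fetch,
    # mirroring the two branches of the reworked parse_page loop.
    wanted = []
    for thread in page_json["threads"]:
        op = thread["posts"][0]
        if op["no"] > last:
            wanted.append(op["no"])                        # brand-new thread
        elif any(p["no"] > last for p in thread["posts"]):
            wanted.append(op["no"])                        # old thread, new replies visible
    return wanted

sample = {"threads": [
    {"posts": [{"no": 100}, {"no": 180}]},                 # old OP, new reply
    {"posts": [{"no": 200}]},                              # new thread
    {"posts": [{"no": 50}, {"no": 60}]},                   # nothing new
]}
print(threads_to_refetch(sample, last=150))                # [100, 200]
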
@@ -117,13 +268,11 @@ def pnomax(last, posts):
         if(post["no"]>mx): mx = post["no"] #we need this because of sage
     return mx
 
-def log(stre):
-    print (stre)
-
 def buffer_write(buf, posts):
-    #TODO: Write buffer stuff
-    pass
+    for post in posts:
+        buf.write(post)
 
+#TODO: When we request buffer data from daemon, send a min post number to send back (last)
 parser = argparse.ArgumentParser(description="Real-time 4chan board watcher.")
 parser.add_argument("board", help="Board to spider")
 parser.add_argument("timeout", help="Time between cycles")
@@ -133,19 +282,30 @@ parser.add_argument("--api", help="Base URL of 4chan JSON API", default="http://
 
 args = parser.parse_args()
 last=0
 
+buf = None
 if args.buffer !=None:
-    pass #TODO: Init buffer stuff
+    buf = FileBuffer(args.buffer, StatBuffer.SLV_NOTEXT)
+else:
+    buf = MemoryBuffer(StatBuffer.SLV_NOTEXT)
 
-while True:
-    log("Reading threads for %s from %d" % (args.board, last))
-    posts = parse_page(args.api, args.board, "1", last)
-    last = pnomax(last, posts)
+last = buf.findMax()
+
+try:
+    while True:
+        log("Reading threads for %s from %d" % (args.board, last))
+        posts = parse_page(args.api, args.board, "1", last)
+        last = pnomax(last, posts)
 
-    if(len(posts)>0):
-        log("%d new posts since last cycle" % len(posts))
-        buffer_write(posts)
-    else:
-        log("Nothing new")
+        if(len(posts)>0):
+            log("\t%d new posts since last cycle" % len(posts))
+            buffer_write(buf, posts)
+        else:
+            log("\tnothing new")
+        log("Buffer written successfully")
 
-    time.sleep(int(args.timeout))
+        time.sleep(int(args.timeout))
+except(KeyboardInterrupt):
+    log("Interrupt detected")
+buf.close()
+log("Closing")
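
With a FileBuffer the buffer file doubles as persistent state: on startup last = buf.findMax() walks the stored records and resumes from the highest post number already captured, and during each cycle pnomax advances the cursor by taking a max rather than the number of the last post seen, since replies can land on older threads (the sage case noted in pnomax). A toy illustration with made-up numbers:

posts = [{"no": 9001}, {"no": 8804}, {"no": 9003}, {"no": 8990}]   # arrival order, not sorted
last = 0
last = max([last] + [p["no"] for p in posts])
print(last)   # 9003 -- using posts[-1]["no"] would rewind the cursor to 8990
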