master
Ringo Watanabe 6 years ago
parent fbe7dd9b13
commit f6a8da49c3
No known key found for this signature in database
GPG Key ID: C1C1CD34CF2907B2

@ -4,6 +4,8 @@ import os.path
import traceback import traceback
import json import json
import socket import socket
import struct
import pprint
import time import time
import datetime import datetime
import json import json
@ -11,6 +13,7 @@ import re
import time import time
import threading import threading
import binascii import binascii
import pylzma
from cffi import FFI from cffi import FFI
import select import select
from subprocess import Popen, PIPE from subprocess import Popen, PIPE
@ -22,44 +25,178 @@ except ImportError:
import requests import requests
def __pmap(): def __pmap():
return [ return {
("com", "c") "com": "c",
] "thread": "t",
"no": "n",
"sub": "s",
"time": "T",
"name": "N",
"trip": "f",
"country": "C",
"id": "i",
"filename": "F",
"image": "I",
"realFilename": "l",
"fileSize": "S"
}
def __encodepost(post):
pass #TODO: __pmap comp and lzma comp
def log(stre):
    """Write one status line to stdout."""
    print(stre)
def encode_post(post):
    """Serialize a post dict compactly.

    Known long key names are shortened via the __pmap() table (unknown keys
    pass through unchanged), the result is JSON-encoded, and the JSON text
    is LZMA-compressed. Inverse of decode_post().
    """
    key_map = __pmap()
    shortened = {key_map.get(key, key): value for key, value in post.items()}
    payload = json.dumps(shortened)
    return pylzma.compress(payload)
def decode_post(data):
    """Inverse of encode_post().

    LZMA-decompresses the payload, parses the JSON, and maps the shortened
    keys back to their long names using the inverted __pmap() table; keys
    not in the table pass through unchanged.
    """
    reverse_map = {short: full for full, short in __pmap().items()}
    decoded = json.loads(pylzma.decompress(data))
    return {reverse_map.get(key, key): value for key, value in decoded.items()}
class StatBuffer: class StatBuffer:
SLV_LOW = 0 #Keep everything SLV_LOW = 0 #Keep everything
SLV_NOTEXT = 1 #Remove subject & comment SLV_NOTEXT = 1 #Remove subject & comment
SLV_NOUI = 2 #Remove all user inputed information (sub, com, filename) SLV_NOPI = 2 #Remove all poster information (sub,com,name,trip)
SLV_NOUI = 3 #Remove all user inputed information (sub, com,name,trip, file info)
SLV_HIGH = 0xFF #Keep only post number SLV_HIGH = 0xFF #Keep only post number
def __init__(): def __init__(self):
pass pass
def _encode(post, striplv): def _decode(self, data):
if(isinstance(data, int)):
return data
else:
return decode_post(data)
def _encode(self, post, striplv):
if(striplv == StatBuffer.SLV_LOW): if(striplv == StatBuffer.SLV_LOW):
return __encodepost(post) return encode_post(post)
elif(striplv == StatBuffer.SLV_NOTEXT): elif(striplv == StatBuffer.SLV_NOTEXT):
if "com" in post: if "com" in post:
del post["com"] del post["com"]
if "sub" in post: if "sub" in post:
del post["sub"] del post["sub"]
return __encodepost(post) return encode_post(post)
elif(striplv == StatBuffer.SLV_NOPI):
if "com" in post:
del post["com"]
if "sub" in post:
del post["sub"]
if "name" in post:
del post["name"]
if "trip" in post:
del post["trip"]
return encode_post(post)
elif(striplv == StatBuffer.SLV_NOUI): elif(striplv == StatBuffer.SLV_NOUI):
if "com" in post: if "com" in post:
del post["com"] del post["com"]
if "sub" in post: if "sub" in post:
del post["sub"] del post["sub"]
#TODO: Remove image stuff if "name" in post:
return __encodepost(post) del post["name"]
if "trip" in post:
del post["trip"]
if "filename" in post:
del post["filename"]
del post["fileSize"]
del post["realFilename"]
del post["image"]
return encode_post(post)
elif(striplv == StatBuffer.SLV_HIGH): elif(striplv == StatBuffer.SLV_HIGH):
return post["no"] return encode_post({"no": post["no"]})
else: return None else: return None
def write(): def write(self, post):
raise NotImplementedError("Abstract method not implemented") raise NotImplementedError("Abstract method not implemented")
def read(): def read(self):
raise NotImplementedError("Abstract method not implemented") raise NotImplementedError("Abstract method not implemented")
def close(self): pass
class MemoryBuffer(StatBuffer):
    """StatBuffer that keeps encoded posts in an in-process list."""

    def __init__(self, level):
        self.lvl = level
        self.store = list()

    def write(self, post):
        """Encode a post at the configured strip level and append it."""
        self.store.append(super()._encode(post, self.lvl))

    def raw(self):
        """Serialize the raw store as a JSON array.

        NOTE(review): entries are compressed bytes, which json.dumps cannot
        serialize — verify what callers expect here.
        """
        return json.dumps(self.store)

    def clear(self):
        """Drop every buffered post (rebinds, so old references keep their data)."""
        self.store = list()

    def length(self):
        """Number of buffered posts."""
        return len(self.store)

    def read(self):
        """Decode and return every buffered post as a list of dicts."""
        return [self._decode(entry) for entry in self.store]

    def findMax(self):
        """Highest post number buffered so far (0 when empty)."""
        highest = 0
        for entry in self.store:
            number = self._decode(entry)["no"]
            if number > highest:
                highest = number
        return highest
class FileBuffer(StatBuffer):
    """StatBuffer backed by a file of length-prefixed records.

    Each record is a native unsigned-int length prefix (struct format "I")
    followed by the encoded post payload, appended to the file. The file is
    opened "a+b" so writes always append while reads can seek freely.
    """

    # Byte width of the native "I" length prefix (4 on common platforms).
    _LEN_SIZE = struct.calcsize("I")

    def __init__(self, fn, level):
        self.lvl = level
        self.file = open(fn, "a+b")

    def _records(self):
        """Yield each record's raw payload, front to back.

        Shared scan used by read()/length()/findMax(); replaces three copies
        of the same loop. file.read() returns b"" (never None) at EOF, so a
        short length prefix is the termination condition. Leaves the file
        positioned at EOF so subsequent write() calls append correctly.
        """
        self.file.seek(0)
        while True:
            prefix = self.file.read(self._LEN_SIZE)
            if len(prefix) < self._LEN_SIZE:
                break  # EOF or trailing garbage shorter than a prefix
            (size,) = struct.unpack("I", prefix)
            yield self.file.read(size)
        self.file.seek(0, 2)

    def write(self, post):
        """Encode a post at the configured strip level and append one record."""
        data = super()._encode(post, self.lvl)
        self.file.write(struct.pack("I", len(data)))
        self.file.write(data)

    def read(self):
        """Decode and return every buffered post as a list of dicts."""
        return [self._decode(payload) for payload in self._records()]

    def length(self):
        """Number of records in the file (full scan)."""
        return sum(1 for _ in self._records())

    def close(self):
        """Close the backing file."""
        self.file.close()

    def clear(self):
        """Empty the backing file.

        Bug fix: truncate() cuts the file at its CURRENT position, and every
        other method leaves the position at EOF, so the original bare
        truncate() was a no-op. Seek to the start first so the data is
        actually discarded.
        """
        self.file.seek(0)
        self.file.truncate()

    def findMax(self):
        """Highest post number stored in the file (0 when empty)."""
        highest = 0
        for payload in self._records():
            number = self._decode(payload)["no"]
            if number > highest:
                highest = number
        return highest
def parse_post(post): def parse_post(post):
res = dict() res = dict()
@ -94,21 +231,35 @@ def parse_post(post):
return res return res
def parse_thread(api, board, post, last):
    """Fetch a full thread from the JSON API and return its new posts, parsed.

    api   -- API URL template; board is substituted via `api % board`
    board -- board short name
    post  -- OP post dict; its "no" field identifies the thread
    last  -- highest post number already processed; only posts with a
             greater number are returned
    Returns a list of parse_post() results (None results are dropped).
    """
    url = (api % board) + "thread/" + str(post["no"]) + ".json"
    # Robustness fix: without a timeout, a hung connection would stall the
    # whole spider loop indefinitely.
    fullThread = requests.get(url, timeout=30).json()
    posts = list()
    for fPost in fullThread["posts"]:
        if fPost["no"] > last:
            np = parse_post(fPost)
            if np is not None:
                posts.append(np)
    return posts
#if we spider all pages, go from page 10 to 1
def parse_page(api, board, page, last): def parse_page(api, board, page, last):
dta = requests.get((api % board)+page+".json") dta = requests.get((api % board)+page+".json")
posts = list() posts = list()
page = dta.json() page = dta.json()
tpd=0
for thread in page["threads"]: for thread in page["threads"]:
post = thread["posts"][0] post = thread["posts"][0]
if post["no"] <= last: if post["no"] <= last:
continue #thread is not new
#are there any new posts?
for vp in thread["posts"]:
if(vp["no"] >last):
posts.extend(parse_thread(api,board,post,last))
tpd+=1
break
else: else:
fullThread = requests.get((api % board)+"thread/"+str(post["no"])+".json").json() posts.extend(parse_thread(api,board, post,last))
for fPost in fullThread["posts"]: tpd+=1
np = parse_post(fPost) log("\t(threads parsed this rotation: %d)"%tpd)
if(np!=None):
posts.append(np)
return posts return posts
def pnomax(last, posts): def pnomax(last, posts):
@ -117,13 +268,11 @@ def pnomax(last, posts):
if(post["no"]>mx): mx = post["no"] #we need this because of sage if(post["no"]>mx): mx = post["no"] #we need this because of sage
return mx return mx
def log(stre):
print (stre)
def buffer_write(buf, posts): def buffer_write(buf, posts):
#TODO: Write buffer stuff for post in posts:
pass buf.write(post)
#TODO: When we request buffer data from daemon, send a min post number to send back (last)
parser = argparse.ArgumentParser(description="Real-time 4chan board watcher.") parser = argparse.ArgumentParser(description="Real-time 4chan board watcher.")
parser.add_argument("board", help="Board to spider") parser.add_argument("board", help="Board to spider")
parser.add_argument("timeout", help="Time between cycles") parser.add_argument("timeout", help="Time between cycles")
@ -133,19 +282,30 @@ parser.add_argument("--api", help="Base URL of 4chan JSON API", default="http://
args = parser.parse_args() args = parser.parse_args()
last=0 last=0
buf = None
if args.buffer !=None: if args.buffer !=None:
pass #TODO: Init buffer stuff buf = FileBuffer(args.buffer, StatBuffer.SLV_NOTEXT)
else:
buf = MemoryBuffer(StatBuffer.SLV_NOTEXT)
while True: last = buf.findMax()
log("Reading threads for %s from %d" % (args.board, last))
posts = parse_page(args.api, args.board, "1", last)
last = pnomax(last, posts)
if(len(posts)>0): try:
log("%d new posts since last cycle" % len(posts)) while True:
buffer_write(posts) log("Reading threads for %s from %d" % (args.board, last))
else: posts = parse_page(args.api, args.board, "1", last)
log("Nothing new") last = pnomax(last, posts)
time.sleep(int(args.timeout)) if(len(posts)>0):
log("\t%d new posts since last cycle" % len(posts))
buffer_write(buf, posts)
else:
log("\tnothing new")
log("Buffer written successfully")
time.sleep(int(args.timeout))
except(KeyboardInterrupt):
log("Interrupt detected")
buf.close()
log("Closing")

Binary file not shown.
Loading…
Cancel
Save