"""Real-time 4chan board watcher: spiders a board's JSON API and stores
stripped-down posts in a compressed buffer."""
import sys
import os
import os.path
import traceback
import json
import socket
import struct
import pprint
import time
import datetime
import re
import threading
import binascii

import pylzma

from cffi import FFI
import select
from subprocess import Popen, PIPE
import argparse

try:
    import ssl
except ImportError:
    print("error: no ssl support")

import requests


def __pmap():
    # Map verbose post keys to single characters so the JSON shrinks
    # before compression.
    return {
        "com": "c",
        "thread": "t",
        "no": "n",
        "sub": "s",
        "time": "T",
        "name": "N",
        "trip": "f",
        "country": "C",
        "id": "i",
        "filename": "F",
        "image": "I",
        "realFilename": "l",
        "fileSize": "S"
    }


def log(stre):
    print(stre)


def encode_post(post):
    """Shorten keys via __pmap() and LZMA-compress the JSON blob."""
    mape = __pmap()
    np = dict()
    for k, v in post.items():
        nk = k
        if k in mape:
            nk = mape[k]
        np[nk] = v
    js = json.dumps(np)
    data = pylzma.compress(js.encode("utf-8"))  # compress wants bytes on Python 3
    #log("Encoded post from %d to %d bytes (%.2f%% reduction)" % (len(js), len(data), 100.00 - (len(data) / len(js)) * 100.00))
    return data


def decode_post(data):
    """Decompress and map the short keys back to their long names."""
    js = pylzma.decompress(data)
    np = json.loads(js)
    mape = dict((v, k) for k, v in __pmap().items())
    post = dict()
    for k, v in np.items():
        nk = k
        if k in mape:
            nk = mape[k]
        post[nk] = v
    return post
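

# A minimal sketch of the encode/decode round trip (the sample post is
# hypothetical; assumes pylzma is installed):
def _demo_roundtrip():
    sample = {"no": 123456, "com": "first"}
    data = encode_post(sample)  # keys shrink to {"n": ..., "c": ...}, then LZMA
    assert decode_post(data) == sample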


class StatBuffer:
    SLV_LOW = 0      # Keep everything
    SLV_NOTEXT = 1   # Remove subject & comment
    SLV_NOPI = 2     # Remove all poster information (sub, com, name, trip)
    SLV_NOUI = 3     # Remove all user-inputted information (sub, com, name, trip, file info)
    SLV_HIGH = 0xFF  # Keep only the post number

    def __init__(self):
        pass

    def _decode(self, data):
        if isinstance(data, int):
            return data
        else:
            return decode_post(data)

    def _encode(self, post, striplv):
        """Drop the fields the strip level calls for, then compress."""
        if striplv == StatBuffer.SLV_LOW:
            return encode_post(post)
        elif striplv == StatBuffer.SLV_NOTEXT:
            strip = ("com", "sub")
        elif striplv == StatBuffer.SLV_NOPI:
            strip = ("com", "sub", "name", "trip")
        elif striplv == StatBuffer.SLV_NOUI:
            strip = ("com", "sub", "name", "trip",
                     "filename", "fileSize", "realFilename", "image")
        elif striplv == StatBuffer.SLV_HIGH:
            return encode_post({"no": post["no"]})
        else:
            return None
        for key in strip:
            if key in post:
                del post[key]
        return encode_post(post)

    def write(self, post):
        raise NotImplementedError("Abstract method not implemented")

    def read(self):
        raise NotImplementedError("Abstract method not implemented")

    def close(self):
        pass


class MemoryBuffer(StatBuffer):
    def __init__(self, level):
        self.lvl = level
        self.store = list()

    def write(self, post):
        data = super()._encode(post, self.lvl)
        self.store.append(data)

    def raw(self):
        # Compressed entries are bytes, which json can't serialise directly.
        return json.dumps([binascii.hexlify(d).decode("ascii") for d in self.store])

    def clear(self):
        self.store = list()

    def length(self):
        return len(self.store)

    def read(self):
        base = super()  # bind outside the genexp; bare super() fails inside comprehensions
        return list(base._decode(d) for d in self.store)

    def findMax(self):
        mx = 0
        for ent in self.store:
            post = super()._decode(ent)
            if post["no"] > mx:
                mx = post["no"]
        return mx
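

# A minimal sketch of in-memory buffering (the post dict is hypothetical):
def _demo_memory_buffer():
    buf = MemoryBuffer(StatBuffer.SLV_NOTEXT)
    buf.write({"no": 1, "com": "stripped at SLV_NOTEXT", "time": "now"})
    return buf.read()  # -> [{"no": 1, "time": "now"}]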


class FileBuffer(StatBuffer):
    """Append-only buffer file: each record is a 4-byte native-endian
    length prefix (struct "I") followed by the compressed post."""

    def __init__(self, fn, level):
        self.lvl = level
        self.file = open(fn, "a+b")

    def write(self, post):
        data = super()._encode(post, self.lvl)
        self.file.write(struct.pack("I", len(data)))
        self.file.write(data)

    def read(self):
        self.file.seek(0)
        posts = list()
        lentr = self.file.read(4)
        while len(lentr) == 4:
            tl = struct.unpack("I", lentr)[0]
            posts.append(super()._decode(self.file.read(tl)))
            lentr = self.file.read(4)
        self.file.seek(0, 2)  # return to the end so appends keep working
        return posts

    def length(self):
        self.file.seek(0)
        maxln = 0
        lentr = self.file.read(4)
        while len(lentr) == 4:
            tl = struct.unpack("I", lentr)[0]
            self.file.seek(tl, 1)  # skip the record body
            maxln += 1
            lentr = self.file.read(4)
        self.file.seek(0, 2)
        return maxln

    def close(self):
        self.file.close()

    def clear(self):
        # Truncate from the start, not from wherever the cursor happens to be.
        self.file.seek(0)
        self.file.truncate()

    def findMax(self):
        self.file.seek(0)
        mx = 0
        lentr = self.file.read(4)
        while len(lentr) == 4:
            tl = struct.unpack("I", lentr)[0]
            post = super()._decode(self.file.read(tl))
            if post["no"] > mx:
                mx = post["no"]
            lentr = self.file.read(4)
        self.file.seek(0, 2)
        return mx
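

# A minimal sketch of persisting and reloading a buffer (the filename
# "demo.buf" is a placeholder):
def _demo_file_buffer():
    buf = FileBuffer("demo.buf", StatBuffer.SLV_HIGH)
    buf.write({"no": 42})
    posts = buf.read()  # -> [{"no": 42}]
    buf.close()
    return posts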


def parse_post(post):
    """Reduce a raw API post to the fields we keep; stickied thread OPs
    return None."""
    res = dict()
    if "resto" not in post or post["resto"] == 0:  # post is a thread OP
        if "sticky" in post:
            return None
    else:
        res["thread"] = post["resto"]
    res["no"] = post["no"]
    if "com" in post:
        res["com"] = post["com"]
    if "sub" in post:
        res["sub"] = post["sub"]
    res["time"] = post["now"]
    if "name" in post and post["name"] != "Anonymous":
        res["name"] = post["name"]
    if "trip" in post:
        res["trip"] = post["trip"]
    if "country" in post:
        res["country"] = post["country"]
    if "id" in post:
        res["id"] = post["id"]
    if "filename" in post:
        res["filename"] = post["filename"] + post["ext"]
        res["image"] = post["md5"]
        res["realFilename"] = post["tim"]
        res["fileSize"] = post["fsize"]
    return res
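

# A sketch of parse_post on a hypothetical API reply ("resto" points at
# the OP's post number; all values here are made up):
def _demo_parse_post():
    raw = {"no": 123457, "resto": 123456, "com": "nice thread",
           "now": "01/01/19(Tue)12:00:00", "name": "Anonymous"}
    return parse_post(raw)
    # -> {"thread": 123456, "no": 123457, "com": "nice thread",
    #     "time": "01/01/19(Tue)12:00:00"}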


def parse_thread(api, board, post, last):
    """Fetch a full thread and return parsed posts newer than `last`."""
    fullThread = requests.get((api % board) + "thread/" + str(post["no"]) + ".json").json()
    posts = list()
    for fPost in fullThread["posts"]:
        if fPost["no"] > last:
            np = parse_post(fPost)
            if np is not None:
                posts.append(np)
    return posts


#if we spider all pages, go from page 10 to 1
def parse_page(api, board, page, last):
    dta = requests.get((api % board) + page + ".json")
    posts = list()
    page = dta.json()
    tpd = 0
    for thread in page["threads"]:
        post = thread["posts"][0]
        if post["no"] <= last:
            # Thread is not new -- does it contain any new posts?
            for vp in thread["posts"]:
                if vp["no"] > last:
                    posts.extend(parse_thread(api, board, post, last))
                    tpd += 1
                    break
        else:
            posts.extend(parse_thread(api, board, post, last))
            tpd += 1
    log("\t(threads parsed this rotation: %d)" % tpd)
    return posts


def pnomax(last, posts):
    """Highest post number seen so far; saged replies can arrive out of
    bump order, so we can't just take the last post."""
    mx = last
    for post in posts:
        if post["no"] > mx:
            mx = post["no"]
    return mx


def buffer_write(buf, posts):
    for post in posts:
        buf.write(post)

#TODO: When we request buffer data from daemon, send a min post number to send back (last)
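
# Example invocation (script name, board, and interval are placeholders):
#   python watcher.py g 60 --buffer g.buf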
parser = argparse.ArgumentParser(description="Real-time 4chan board watcher.")
parser.add_argument("board", help="Board to spider")
parser.add_argument("timeout", type=int, help="Time between cycles, in seconds")
parser.add_argument("--buffer", help="Save-buffer filename (default: use an in-memory buffer)", default=None)
parser.add_argument("--daemon", action="store_true", help="Run as daemon (not implemented yet)")
parser.add_argument("--api", help="Base URL of the 4chan JSON API", default="http://api.4chan.org/%s/")
args = parser.parse_args()

if args.buffer is not None:
    buf = FileBuffer(args.buffer, StatBuffer.SLV_NOTEXT)
else:
    buf = MemoryBuffer(StatBuffer.SLV_NOTEXT)

# Resume from the highest post number already in the buffer.
last = buf.findMax()

try:
    while True:
        log("Reading threads for %s from %d" % (args.board, last))
        posts = parse_page(args.api, args.board, "1", last)
        last = pnomax(last, posts)
        if len(posts) > 0:
            log("\t%d new posts since last cycle" % len(posts))
            buffer_write(buf, posts)
            log("Buffer written successfully")
        else:
            log("\tnothing new")
        time.sleep(args.timeout)
except KeyboardInterrupt:
    log("Interrupt detected")

buf.close()
log("Closing")