"""Real-time 4chan board watcher: spiders a board's JSON API and stores
stripped-down posts in a compressed buffer."""
import sys
import os
import os.path
import traceback
import json
import socket
import struct
import pprint
import time
import datetime
import re
import threading
import binascii

import pylzma

from cffi import FFI
import select
from subprocess import Popen, PIPE
import argparse

try:
    import ssl
except ImportError:
    print("error: no ssl support")

import requests


def __pmap():
    # Map verbose post keys to single characters so the JSON shrinks
    # before compression.
    return {
        "com": "c",
        "thread": "t",
        "no": "n",
        "sub": "s",
        "time": "T",
        "name": "N",
        "trip": "f",
        "country": "C",
        "id": "i",
        "filename": "F",
        "image": "I",
        "realFilename": "l",
        "fileSize": "S"
    }


def log(stre):
    print(stre)


def encode_post(post):
    """Shorten keys via __pmap() and LZMA-compress the JSON blob."""
    mape = __pmap()
    np = dict()
    for k, v in post.items():
        nk = k
        if k in mape:
            nk = mape[k]
        np[nk] = v
    js = json.dumps(np)
    data = pylzma.compress(js.encode("utf-8"))  # compress wants bytes on Python 3
    #log("Encoded post from %d to %d bytes (%.2f%% reduction)" % (len(js), len(data), 100.00 - (len(data) / len(js)) * 100.00))
    return data


def decode_post(data):
    """Decompress and map the short keys back to their long names."""
    js = pylzma.decompress(data)
    np = json.loads(js)
    mape = dict((v, k) for k, v in __pmap().items())
    post = dict()
    for k, v in np.items():
        nk = k
        if k in mape:
            nk = mape[k]
        post[nk] = v
    return post
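

# A minimal sketch of the encode/decode round trip (the sample post is
# hypothetical; assumes pylzma is installed):
def _demo_roundtrip():
    sample = {"no": 123456, "com": "first"}
    data = encode_post(sample)  # keys shrink to {"n": ..., "c": ...}, then LZMA
    assert decode_post(data) == sample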


class StatBuffer:
    SLV_LOW = 0      # Keep everything
    SLV_NOTEXT = 1   # Remove subject & comment
    SLV_NOPI = 2     # Remove all poster information (sub, com, name, trip)
    SLV_NOUI = 3     # Remove all user-inputted information (sub, com, name, trip, file info)
    SLV_HIGH = 0xFF  # Keep only the post number

    def __init__(self):
        pass

    def _decode(self, data):
        if isinstance(data, int):
            return data
        else:
            return decode_post(data)

    def _encode(self, post, striplv):
        """Drop the fields the strip level calls for, then compress."""
        if striplv == StatBuffer.SLV_LOW:
            return encode_post(post)
        elif striplv == StatBuffer.SLV_NOTEXT:
            strip = ("com", "sub")
        elif striplv == StatBuffer.SLV_NOPI:
            strip = ("com", "sub", "name", "trip")
        elif striplv == StatBuffer.SLV_NOUI:
            strip = ("com", "sub", "name", "trip",
                     "filename", "fileSize", "realFilename", "image")
        elif striplv == StatBuffer.SLV_HIGH:
            return encode_post({"no": post["no"]})
        else:
            return None
        for key in strip:
            if key in post:
                del post[key]
        return encode_post(post)

    def write(self, post):
        raise NotImplementedError("Abstract method not implemented")

    def read(self):
        raise NotImplementedError("Abstract method not implemented")

    def close(self):
        pass


class MemoryBuffer(StatBuffer):
    def __init__(self, level):
        self.lvl = level
        self.store = list()

    def write(self, post):
        data = super()._encode(post, self.lvl)
        self.store.append(data)

    def raw(self):
        # Compressed entries are bytes, which json can't serialise directly.
        return json.dumps([binascii.hexlify(d).decode("ascii") for d in self.store])

    def clear(self):
        self.store = list()

    def length(self):
        return len(self.store)

    def read(self):
        base = super()  # bind outside the genexp; bare super() fails inside comprehensions
        return list(base._decode(d) for d in self.store)

    def findMax(self):
        mx = 0
        for ent in self.store:
            post = super()._decode(ent)
            if post["no"] > mx:
                mx = post["no"]
        return mx
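

# A minimal sketch of in-memory buffering (the post dict is hypothetical):
def _demo_memory_buffer():
    buf = MemoryBuffer(StatBuffer.SLV_NOTEXT)
    buf.write({"no": 1, "com": "stripped at SLV_NOTEXT", "time": "now"})
    return buf.read()  # -> [{"no": 1, "time": "now"}]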


class FileBuffer(StatBuffer):
    """Append-only buffer file: each record is a 4-byte native-endian
    length prefix (struct "I") followed by the compressed post."""

    def __init__(self, fn, level):
        self.lvl = level
        self.file = open(fn, "a+b")

    def write(self, post):
        data = super()._encode(post, self.lvl)
        self.file.write(struct.pack("I", len(data)))
        self.file.write(data)

    def read(self):
        self.file.seek(0)
        posts = list()
        lentr = self.file.read(4)
        while len(lentr) == 4:
            tl = struct.unpack("I", lentr)[0]
            posts.append(super()._decode(self.file.read(tl)))
            lentr = self.file.read(4)
        self.file.seek(0, 2)  # return to the end so appends keep working
        return posts

    def length(self):
        self.file.seek(0)
        maxln = 0
        lentr = self.file.read(4)
        while len(lentr) == 4:
            tl = struct.unpack("I", lentr)[0]
            self.file.seek(tl, 1)  # skip the record body
            maxln += 1
            lentr = self.file.read(4)
        self.file.seek(0, 2)
        return maxln

    def close(self):
        self.file.close()

    def clear(self):
        # Truncate from the start, not from wherever the cursor happens to be.
        self.file.seek(0)
        self.file.truncate()

    def findMax(self):
        self.file.seek(0)
        mx = 0
        lentr = self.file.read(4)
        while len(lentr) == 4:
            tl = struct.unpack("I", lentr)[0]
            post = super()._decode(self.file.read(tl))
            if post["no"] > mx:
                mx = post["no"]
            lentr = self.file.read(4)
        self.file.seek(0, 2)
        return mx
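

# A minimal sketch of persisting and reloading a buffer (the filename
# "demo.buf" is a placeholder):
def _demo_file_buffer():
    buf = FileBuffer("demo.buf", StatBuffer.SLV_HIGH)
    buf.write({"no": 42})
    posts = buf.read()  # -> [{"no": 42}]
    buf.close()
    return posts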


def parse_post(post):
    """Reduce a raw API post to the fields we keep; stickied thread OPs
    return None."""
    res = dict()
    if "resto" not in post or post["resto"] == 0:  # post is a thread OP
        if "sticky" in post:
            return None
    else:
        res["thread"] = post["resto"]
    res["no"] = post["no"]
    if "com" in post:
        res["com"] = post["com"]
    if "sub" in post:
        res["sub"] = post["sub"]
    res["time"] = post["now"]
    if "name" in post and post["name"] != "Anonymous":
        res["name"] = post["name"]
    if "trip" in post:
        res["trip"] = post["trip"]
    if "country" in post:
        res["country"] = post["country"]
    if "id" in post:
        res["id"] = post["id"]
    if "filename" in post:
        res["filename"] = post["filename"] + post["ext"]
        res["image"] = post["md5"]
        res["realFilename"] = post["tim"]
        res["fileSize"] = post["fsize"]
    return res
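

# A sketch of parse_post on a hypothetical API reply ("resto" points at
# the OP's post number; all values here are made up):
def _demo_parse_post():
    raw = {"no": 123457, "resto": 123456, "com": "nice thread",
           "now": "01/01/19(Tue)12:00:00", "name": "Anonymous"}
    return parse_post(raw)
    # -> {"thread": 123456, "no": 123457, "com": "nice thread",
    #     "time": "01/01/19(Tue)12:00:00"}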


def parse_thread(api, board, post, last):
    """Fetch a full thread and return parsed posts newer than `last`."""
    fullThread = requests.get((api % board) + "thread/" + str(post["no"]) + ".json").json()
    posts = list()
    for fPost in fullThread["posts"]:
        if fPost["no"] > last:
            np = parse_post(fPost)
            if np is not None:
                posts.append(np)
    return posts


#if we spider all pages, go from page 10 to 1
def parse_page(api, board, page, last):
    dta = requests.get((api % board) + page + ".json")
    posts = list()
    page = dta.json()
    tpd = 0
    for thread in page["threads"]:
        post = thread["posts"][0]
        if post["no"] <= last:
            # Thread is not new -- does it contain any new posts?
            for vp in thread["posts"]:
                if vp["no"] > last:
                    posts.extend(parse_thread(api, board, post, last))
                    tpd += 1
                    break
        else:
            posts.extend(parse_thread(api, board, post, last))
            tpd += 1
    log("\t(threads parsed this rotation: %d)" % tpd)
    return posts


def pnomax(last, posts):
    """Highest post number seen so far; saged replies can arrive out of
    bump order, so we can't just take the last post."""
    mx = last
    for post in posts:
        if post["no"] > mx:
            mx = post["no"]
    return mx


def buffer_write(buf, posts):
    for post in posts:
        buf.write(post)

#TODO: When we request buffer data from daemon, send a min post number to send back (last)
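
# Example invocation (script name, board, and interval are placeholders):
#   python watcher.py g 60 --buffer g.buf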
parser = argparse.ArgumentParser(description="Real-time 4chan board watcher.")
parser.add_argument("board", help="Board to spider")
parser.add_argument("timeout", type=int, help="Time between cycles, in seconds")
parser.add_argument("--buffer", help="Save-buffer filename (default: use an in-memory buffer)", default=None)
parser.add_argument("--daemon", action="store_true", help="Run as daemon (not implemented yet)")
parser.add_argument("--api", help="Base URL of the 4chan JSON API", default="http://api.4chan.org/%s/")
args = parser.parse_args()

if args.buffer is not None:
    buf = FileBuffer(args.buffer, StatBuffer.SLV_NOTEXT)
else:
    buf = MemoryBuffer(StatBuffer.SLV_NOTEXT)

# Resume from the highest post number already in the buffer.
last = buf.findMax()

try:
    while True:
        log("Reading threads for %s from %d" % (args.board, last))
        posts = parse_page(args.api, args.board, "1", last)
        last = pnomax(last, posts)
        if len(posts) > 0:
            log("\t%d new posts since last cycle" % len(posts))
            buffer_write(buf, posts)
            log("Buffer written successfully")
        else:
            log("\tnothing new")
        time.sleep(args.timeout)
except KeyboardInterrupt:
    log("Interrupt detected")

buf.close()
log("Closing")