Plover Temp: Sourcing Stock Code

原本把程式拆成三部分，但維護小麻煩，不如全部做在一起。總之，我用這些 Stock Code 把近十年三大財務報表全部爬回來，約莫23萬個網頁，可怕。

但目前只有分析現金流量表，工作太忙沒時間處理，手上又有書要看，得做一些取捨。雖然工作還是鳥鳥的，但今天又重新燃起 SQL Server 濃濃的愛液興致，入地獄寶山空手而回多可惜，不如多加強內在，把不知道的補起來，對自己才有幫助。這也印證書中幾句話：自己好，就換別人窮。無奈。結構性問題，不是加強自我技能就能解決的，如果結構錯了，基礎歪了，上面蓋的房子就會不牢靠。

舉例來說，健保，究竟是社會福利，企圖調整社會結構讓窮人也能看病。亦或是廉價保險，犧牲醫護人員生活水平，政府大力補貼，讓所有人都能看病。確定清楚，才能談收費，如果是社會福利，只要正常課稅就夠了，窮人不課富人多課，這樣就行了。如果是廉價保險，那便走商業保險模式，高風險國民負擔重一點就行了。

現在我真~~他媽的~~看不出收費基準在哪。

這一路也可講到油電雙漲。收費基準在哪？表面上照顧窮人，實際上圖利企業。像我還可以負擔，照價收費也沒問題。窮人不收他水電費，該死的企業麻煩請照價付費，企業是窮人嗎？整個收費基準亂七八糟~~，早說過政府吃屎長大的，特別是帶頭老大，屎吃最多~~。不說了。

雖然常常覺得某些事就這樣，動不了，但不改變自己會後悔，好煩。我只是不想後悔而已。

sourcing.py

import csv
import logging
import os
import shutil
import sqlite3

from lxml import html

from ..common import logger
from ..common import sourcing_template

class Sourcing(sourcing_template.SourcingTemplate):
    """Fetch stock-code listing pages and load them into the StockCode table.

    The pipeline, driven by source(), has three stages:
    1. download the HTML listing pages with the bundled wget binary,
    2. extract the table rows of each page into a CSV file,
    3. insert every CSV row into the SQLite database.
    """

    def __init__(self):
        sourcing_template.SourcingTemplate.__init__(self)
        self.__logger = logging.getLogger()
        # Pages to download, keyed by the listing they are saved as
        # (see source_url_to_local).
        self.URLS = {
            'LISTED': 'http://brk.tse.com.tw:8000/isin/C_public.jsp?strMode=2',
            'TOC': 'http://brk.tse.com.tw:8000/isin/C_public.jsp?strMode=4',
        }
        self.LOCAL_DIR = './dataset/stock_code/local/'
        self.CSV_DIR = './dataset/stock_code/csv/'
        self.DB_FILE = './db/stocktotal.db'
        # "insert or ignore" skips rows that would violate a constraint
        # instead of raising, so re-running the import is harmless.
        self.SQL_INSERT = '''insert or ignore into
            StockCode(code, name, isin_code, listing_date,
            market_category, industry_category, cfi_code)
            values(?, ?, ?, ?, ?, ?, ?)
            '''

    def source(self):
        """Run the whole pipeline: URL -> local HTML -> CSV -> SQLite."""
        self.source_url_to_local(self.LOCAL_DIR)
        self.source_local_to_csv_batch(self.LOCAL_DIR, self.CSV_DIR)
        self.source_csv_to_sqlite_batch(self.CSV_DIR, self.DB_FILE)

    def source_url_to_local(self, dest_dir):
        """Download every page in self.URLS into dest_dir.

        dest_dir is created if it does not exist yet.
        """
        os.makedirs(dest_dir, exist_ok=True)
        # Save into the directory the caller asked for, not a fixed one.
        self.__wget(self.URLS['LISTED'],
                    os.path.join(dest_dir, 'listed_company.html'))
        self.__wget(self.URLS['TOC'],
                    os.path.join(dest_dir, 'toc_company.html'))

    def source_local_to_csv_batch(self, src_dir, dest_dir):
        """Convert every file in src_dir to '<original name>.csv' in dest_dir.

        dest_dir is created if it does not exist yet.
        """
        assert os.path.isdir(src_dir)
        os.makedirs(dest_dir, exist_ok=True)
        for entry in os.listdir(src_dir):
            src_file = os.path.join(src_dir, entry)
            # The full source name (extension included) is kept as the stem.
            dest_file = os.path.join(dest_dir, entry + '.csv')
            self.source_local_to_csv(src_file, dest_file)

    def source_local_to_csv(self, src_file, dest_file):
        """Extract the rows of the listing table in src_file into dest_file.

        Only rows with 5 or 6 text cells are written; every other row is
        skipped. The first cell is split on whitespace into separate fields,
        and 5-cell rows get an empty field inserted before the last cell so
        they line up with 6-cell rows.
        """
        assert os.path.isfile(src_file)

        with open(src_file) as src_file_h:
            content = src_file_h.read()
        table = html.fromstring(content)

        # The context manager guarantees the CSV is flushed and closed.
        with open(dest_file, 'w', newline='') as dest_file_h:
            csv_writer = csv.writer(dest_file_h)
            for row in table.xpath('//body/table[@class="h4"]/tr'):
                columns = [cell.strip() for cell in row.xpath('./td/text()')]
                if len(columns) == 5:
                    csv_writer.writerow(
                        columns[0].split() + columns[1:4] + [''] + columns[4:])
                elif len(columns) == 6:
                    csv_writer.writerow(columns[0].split() + columns[1:])

    def source_csv_to_sqlite_batch(self, src_dir, db_file):
        """Load every file found in src_dir into the database db_file."""
        assert os.path.isdir(src_dir)

        for entry in os.listdir(src_dir):
            self.source_csv_to_sqlite(os.path.join(src_dir, entry), db_file)

    def source_csv_to_sqlite(self, src_file, db_file):
        """Insert the first seven fields of each row of src_file into db_file.

        db_file must already exist. All rows are committed together once the
        whole file has been read; the connection is closed even if an insert
        fails.
        """
        assert os.path.isfile(src_file)
        assert os.path.isfile(db_file)

        conn = sqlite3.connect(db_file)
        try:
            cursor = conn.cursor()
            # newline='' is what the csv module expects for its input files.
            with open(src_file, 'r', newline='') as src_file_h:
                for r in csv.reader(src_file_h):
                    cursor.execute(
                        self.SQL_INSERT,
                        (r[0], r[1], r[2], r[3], r[4], r[5], r[6]))
            conn.commit()
            cursor.close()
        finally:
            conn.close()

    def __wget(self, url, dest_file):
        """Download url to dest_file using the bundled wget executable."""
        import subprocess

        wget = os.path.abspath('./src/thirdparty/wget/wget.exe')
        assert os.path.isfile(wget)
        # An argument list avoids shell quoting problems, e.g. a dest_file
        # that contains spaces. The exit status is ignored, as before.
        subprocess.call([wget, '-N', url, '--waitretry=3', '-O', dest_file])

schema.sql

-- Recreate the StockCode table from scratch: any existing table (and its
-- rows) is dropped first.
DROP TABLE IF EXISTS StockCode;

CREATE TABLE IF NOT EXISTS StockCode (
    -- Filled in automatically when an insert omits the column.
    creation_dt       datetime DEFAULT current_timestamp,
    -- code, name and isin_code must each be unique across all rows.
    code              text UNIQUE,
    name              text UNIQUE,
    isin_code         text UNIQUE,
    listing_date      datetime,
    market_category   text,
    industry_category text,
    cfi_code          text
);

Plover Temp

2012年10月2日星期二

Sourcing Stock Code

沒有留言:

張貼留言

2012年10月2日 星期二

Sourcing Stock Code

沒有留言:

張貼留言

2012年10月2日星期二