Monday, October 22, 2012

Operating Income

((Catching a substitute, as the water-ghost idiom goes))



SQLite3 Schema:

create table if not exists OperatingIncome
(
    creation_dt datetime default current_timestamp,
    report_date datetime not null,
    stock_code text not null,
    activity_date datetime not null,
    income real,
    unique (report_date, stock_code, activity_date) on conflict ignore
);
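
The unique(...) on conflict ignore constraint is what makes re-sourcing idempotent: replayed rows with the same (report_date, stock_code, activity_date) are silently skipped. A minimal sketch of that behavior against an in-memory database (the row values are hypothetical):

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('''create table if not exists OperatingIncome
    (
        creation_dt datetime default current_timestamp,
        report_date datetime not null,
        stock_code text not null,
        activity_date datetime not null,
        income real,
        unique (report_date, stock_code, activity_date) on conflict ignore
    )''')
sql = '''insert or ignore into OperatingIncome(
        report_date, stock_code, activity_date, income
    ) values(?, ?, ?, ?)'''
row = ('2012-10-01', '1101', '2012-09-01', 123.0)  # hypothetical record
conn.execute(sql, row)
conn.execute(sql, row)  # duplicate key => silently ignored
print(conn.execute('select count(*) from OperatingIncome').fetchone()[0])  # 1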



Python sourcing.py:

import csv
import logging
import os

from lxml import html

from ..common import sourcing_mops
from ..common import date_util as date_util

class Sourcing(sourcing_mops.SourcingMops):

    def __init__(self):
        # Initialize base-class defaults (e.g. XLS_DIR) before overriding.
        sourcing_mops.SourcingMops.__init__(self)
        self.LOGGER = logging.getLogger()
        self.URL_TEMPLATE = '''http://mops.twse.com.tw/t21/sii/t21sc03_%s_%s.html'''
        self.DATES = []
        self.HTML_DIR = '''./dataset/operating_income/html/'''
        self.CSV_DIR = '''./dataset/operating_income/csv/'''
        self.DB_FILE = './db/stocktotal.db'
        self.SQL_INSERT = '''insert or ignore into OperatingIncome(
                report_date,
                stock_code,
                activity_date,
                income
            ) values(?, ?, ?, ?)'''

    def source(self, begin_date, end_date):
        sourcing_mops.SourcingMops.init_dates(self, begin_date, end_date)
        sourcing_mops.SourcingMops.source_url_to_html(self, self.HTML_DIR)
        self.source_html_to_csv(self.HTML_DIR, self.CSV_DIR)
        sourcing_mops.SourcingMops.source_csv_to_sqlite(self, self.CSV_DIR, self.DB_FILE, self.SQL_INSERT)
 
    def source_html_to_csv(self, src_dir, dest_dir):
        assert os.path.isdir(src_dir)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        for date in reversed(self.DATES):
            self.source_html_to_csv_single(src_dir, dest_dir, date)  

    def source_html_to_csv_single(self, src_dir, dest_dir, date):
        src_file = self.get_filename(src_dir, date, 'html')
        dest_file = self.get_filename(dest_dir, date, 'csv')
        self.LOGGER.debug('''%s => %s''' % (src_file, dest_file))
        assert os.path.isfile(src_file)
     
        dest_fd = open(dest_file, 'w', newline='')
        csv_writer = csv.writer(dest_fd)
     
        src_fd = open(src_file, 'rb')
        src_content = src_fd.read()
        src_fd.close()

        content = None
        try:
            # Assumption: the replaced character is the full-width space
            # (U+3000) that shows up in these big5 pages.
            content = html.fromstring(src_content.decode('big5-hkscs').replace('\u3000', ' '))
        except UnicodeDecodeError as e:
            self.LOGGER.debug(e)
            content = html.fromstring(src_content.decode('gb18030').replace('\u3000', ' '))
         
        for category in content.xpath('//html/body/center/table'):
            for co_list in category.xpath('./tr/td[@colspan="2"]/table'):
                for co in co_list.xpath('./tr[@align="right"]'):
                    # Ignore the summary row of this category
                    summary = co.xpath('./th/text()')
                    if len(summary) == 1:
                        continue

                    items = co.xpath('./td/text()')
                    assert len(items) == 10
                    stock_code = items[0]

                    this_month_record = [
                        date,
                        stock_code,
                        date_util.get_this_month_by(date),
                        items[2].strip().replace(',','')
                    ]
                    last_month_record = [
                        date,
                        stock_code,
                        date_util.get_last_month_by(date),
                        items[3].strip().replace(',','')
                    ]
                    last_year_record = [
                        date,
                        stock_code,
                        date_util.get_last_year_by(date),
                        items[4].strip().replace(',','')
                    ]
                    csv_writer.writerow(this_month_record)
                    csv_writer.writerow(last_month_record)
                    csv_writer.writerow(last_year_record)
        dest_fd.close()
         
    def get_url(self, date):
        return self.URL_TEMPLATE % (date.year - 1911, date.month)
     
    def get_filename(self, src_dir, date, ext):
        return os.path.join(src_dir, date.strftime('%Y-%m') + '.' + ext)
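
A usage sketch, assuming this module is importable as src.operating_income.sourcing (the package path matches the dataset directories above):

import src.operating_income.sourcing as sourcing

s = sourcing.Sourcing()
s.source('2012-01-01', '2012-10-01')   # sources 2012-01 through 2012-10 into ./db/stocktotal.db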



sourcing_mops.py

((On second thought, it still has to be a date => url mapping, with the dates remembered; that is more convenient. I'll figure out how to refactor it later.))

# coding: big5

import csv
import logging
import os
import shutil
import sqlite3

from datetime import date
from datetime import datetime

class SourcingMops():

    def __init__(self):
        self.LOGGER = logging.getLogger()
        self.DATES = []
        self.HTML_DIR = ''
        self.XLS_DIR = ''
        self.CSV_DIR = ''
        self.DB_FILE = './db/stocktotal.db'
        self.SQL_INSERT = ''

    def init_dates(self, begin_date, end_date):
        begin = datetime.strptime(begin_date, '%Y-%m-%d')
        end = datetime.strptime(end_date, '%Y-%m-%d')
        monthly_begin = 12 * begin.year + begin.month - 1
        monthly_end = 12 * end.year + end.month
        for monthly in range(monthly_begin, monthly_end):
            year, month = divmod(monthly, 12)
            self.DATES.append(date(year, month + 1, 1))
         
    def source_url_to_html(self, dest_dir):
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        for date in self.DATES:
            url = self.get_url(date)
            dest_file = self.get_filename(dest_dir, date, 'html')
            self.__wget(url, dest_file)

    def source_zip_to_xls(self, src_dir, dest_dir):
        assert os.path.isdir(src_dir)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        for date in self.DATES:
            src_file = self.get_filename(src_dir, date, 'zip')
            dest_file = self.get_filename(dest_dir, date, 'xls')
            self.source_zip_to_xls_single(src_file, dest_dir, dest_file)
 
    def source_zip_to_xls_single(self, src_file, dest_dir, dest_file):
        assert os.path.isfile(src_file)
        assert os.path.isdir(dest_dir)

        sevenzip_output_dir = os.path.join(dest_dir, 'sevenzip_output_dir')
        self.__sevenzip_extract(src_file, sevenzip_output_dir)
        if not os.path.exists(sevenzip_output_dir):
            self.LOGGER.info('''%s => Failure to extract''' % src_file)
            return

        file_list = os.listdir(sevenzip_output_dir)
        assert len(file_list) == 1
        sevenzip_output_file = os.path.join(sevenzip_output_dir, file_list[0])
        shutil.copy(sevenzip_output_file, dest_file)
        shutil.rmtree(sevenzip_output_dir)
     
    def source_csv_to_sqlite(self, src_dir, dest_db, sql_insert):
        assert os.path.isdir(src_dir)
        assert os.path.isfile(dest_db)
        for date in self.DATES:
            src_file = self.get_filename(src_dir, date, 'csv')
            if os.path.isfile(src_file):
                self.source_csv_to_sqlite_single(src_file, dest_db, sql_insert)
         
    def source_csv_to_sqlite_single(self, src_file, dest_db, sql_insert):
        self.LOGGER.debug('''%s => %s''' % (src_file, dest_db))
        fd = open(src_file, 'r')
        csv_reader = csv.reader(fd)
        conn = sqlite3.connect(dest_db)
        cursor = conn.cursor()
        for row in csv_reader:
            cursor.execute(sql_insert, row)
            self.LOGGER.debug(row)
        conn.commit()
        cursor.close()
        conn.close()
        fd.close()

    def get_url(self, date):
        pass
     
    def get_filename(self, src_dir, date, ext):
        pass
     
    def __wget(self, url, dest_file):
        wget = os.path.abspath('./src/thirdparty/wget/wget.exe')
        assert os.path.isfile(wget)
        wget_cmdline = '''%s -N \"%s\" --waitretry=3 -O \"%s\"''' % (wget, url, dest_file)
        os.system(wget_cmdline)

    def __sevenzip_extract(self, src_file, dest_dir):
        sevenzip = os.path.abspath('./src/thirdparty/sevenzip/7z.exe')
        assert os.path.isfile(sevenzip)
        sevenzip_cmdline = '''%s e %s -y -o%s''' % (sevenzip, src_file, dest_dir)
        os.system(sevenzip_cmdline)
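
init_dates flattens (year, month) into a single monthly index, 12 * year + month - 1, so one range() can walk across year boundaries; divmod() recovers the pair. A quick check of what it produces:

from datetime import date

# 2012-11 through 2013-02, using the same arithmetic as init_dates
monthly_begin = 12 * 2012 + 11 - 1
monthly_end = 12 * 2013 + 2
dates = []
for monthly in range(monthly_begin, monthly_end):
    year, month = divmod(monthly, 12)
    dates.append(date(year, month + 1, 1))
print(dates)  # [2012-11-01, 2012-12-01, 2013-01-01, 2013-02-01]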



date_util.py

import datetime

def get_last_month():
    today = datetime.date.today()
    first = datetime.date(day=1, month=today.month, year=today.year)
    last_month = first - datetime.timedelta(days=1)
    return datetime.date(day=1, month=last_month.month, year=last_month.year)

def get_this_month():
    today = datetime.date.today()
    return datetime.date(day=1, month=today.month, year=today.year)

def get_yesterday():
    return datetime.date.today() - datetime.timedelta(days=1)

def get_last_month_by(someday):
    first = datetime.date(day=1, month=someday.month, year=someday.year)
    last_month = first - datetime.timedelta(days=1)
    return datetime.date(day=1, month=last_month.month, year=last_month.year)

def get_this_month_by(someday):
    return datetime.date(day=1, month=someday.month, year=someday.year)
 
def get_last_year_by(someday):
    first = datetime.date(day=1, month=someday.month, year=someday.year)
    assert first.day == 1
    return datetime.date(day=1, month=first.month, year=first.year-1)
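
Given one report date, the *_by helpers produce the three activity months each monthly report covers; for example (assuming the module is importable as date_util):

import datetime
import date_util

report_date = datetime.date(2012, 10, 1)
print(date_util.get_this_month_by(report_date))   # 2012-10-01
print(date_util.get_last_month_by(report_date))   # 2012-09-01
print(date_util.get_last_year_by(report_date))    # 2011-10-01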

Saturday, October 13, 2012

《King Lear》



There is never a shortage of fools.
Fools had ne'er less wit in a year,
For wise men are grown foppish.
They know not how their wits to wear,
Their manners are so apish.

On need.
O, reason not the need! Our basest beggars
Are in the poorest thing superfluous.
Allow not nature more than nature needs,
Man’s life’s as cheap as beast’s. Thou art a lady.
If only to go warm were gorgeous,
Why, nature needs not what thou gorgeous wear’st,
Which scarcely keeps thee warm.

Weep! Weep! You stone-hearted people, weep for me!!!

Friday, October 12, 2012

Python Try-Except Example


Source Code Structure

.\sourcing_balance_sheet.py
.\sourcing_cash_flow_stmt.py
.\sourcing_income_stmt.py
.\src\balance_sheet\sourcing.py
.\src\cash_flow_stmt\sourcing.py
.\src\income_stmt\sourcing.py
.\src\common\sourcing_base.py
.\src\common\logger.py
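
The three top-level drivers below all follow one pattern: iterate every stock code and wrap each per-stock call in try/except, so an AssertionError raised by one stock's data cannot abort the whole batch. A distilled sketch of the pattern (the names mirror the real drivers; the interfaces are assumed from them):

def source_all(getter, sourcing):
    # getter.get() yields stock codes;
    # sourcing.source_local_to_sqlite() may raise AssertionError on bad data.
    for stock_code in getter.get():
        try:
            sourcing.source_local_to_sqlite(stock_code)
        except AssertionError:
            print("Assertion error: {0}".format(stock_code))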



Details

.\sourcing_balance_sheet.py

import logging
import sys

import src.stock_code.getter as getter
import src.balance_sheet.sourcing as sourcing
import src.common.logger as logger

def source_url_to_local():
    logger.config_root(level=logging.DEBUG)
    s = sourcing.Sourcing()
    s.source_url_to_local('1101', 2010, 4)
    
def source_local_to_sqlite():
    logger.config_root(level=logging.INFO)
    g = getter.Getter()
    s = sourcing.Sourcing()
    for stock_code in g.get():
        try:
            s.source_local_to_sqlite(stock_code)
        except AssertionError:
            print("Assertion error: {0}".format(stock_code))
        
if __name__ == '__main__':
    #sys.exit(source_url_to_local())
    sys.exit(source_local_to_sqlite())



.\sourcing_cash_flow_stmt.py

import logging
import sys

import src.stock_code.getter as getter
import src.cash_flow_stmt.sourcing as sourcing
import src.common.logger as logger

def source_url_to_local():
    logger.config_root(level=logging.DEBUG)
    s = sourcing.Sourcing()
    s.source_url_to_local('1101', 2010, 4)
    
def source_local_to_sqlite():
    logger.config_root(level=logging.DEBUG)
    g = getter.Getter()
    s = sourcing.Sourcing()
    for stock_code in g.get():
        try:
            s.source_local_to_sqlite(stock_code)
        except AssertionError:
            print("Assertion error: {0}".format(stock_code))

if __name__ == '__main__':
    #sys.exit(source_url_to_local())
    sys.exit(source_local_to_sqlite())



.\sourcing_income_stmt.py

import logging
import sys

import src.stock_code.getter as getter
import src.income_stmt.sourcing as sourcing
import src.common.logger as logger

def source_url_to_local():
    logger.config_root(level=logging.DEBUG)
    s = sourcing.Sourcing()
    s.source_url_to_local('1101', 2010, 4)
    
def source_local_to_sqlite():
    logger.config_root(level=logging.DEBUG)
    g = getter.Getter()
    s = sourcing.Sourcing()
    for stock_code in g.get():
        try:
            s.source_local_to_sqlite(stock_code)
        except AssertionError:
            print("Assertion error: {0}".format(stock_code))
        
if __name__ == '__main__':
    #sys.exit(source_url_to_local())
    sys.exit(source_local_to_sqlite())



.\src\balance_sheet\sourcing.py

import os

from ..common import sourcing_base

class Sourcing(sourcing_base.SourcingBase):

    def __init__(self):
        sourcing_base.SourcingBase.__init__(self)
        self.SQL_INSERT = '''insert or ignore into
            BalanceSheet(stock_code, report_type, report_date, activity_date, item, number)
            values(?, ?, ?, ?, ?, ?)
            '''

    def source_url_to_local(self, stock_code, year, season):
        self.__init_dirs(stock_code)
        self.__init_urls(stock_code, year, season)    
        sourcing_base.SourcingBase.source_url_to_local(self, self.LOCAL_DIR)
        
    def source_local_to_sqlite(self, stock_code):
        self.__init_dirs(stock_code)
        #local_file_dir = os.path.join(self.LOCAL_DIR, 'mops.twse.com.tw\mops\web')
        #sourcing_base.SourcingBase.source_local_to_deflated(self, local_file_dir, self.DEFLATED_DIR)
        #sourcing_base.SourcingBase.source_deflated_to_csv(self, self.DEFLATED_DIR, self.CSV_DIR)
        sourcing_base.SourcingBase.source_csv_to_sqlite(self, self.CSV_DIR, self.DB_FILE, self.SQL_INSERT)    
        
    def __init_dirs(self, stock_code):
        self.LOCAL_DIR = os.path.join('./dataset/balance_sheet/local/', stock_code)
        self.DEFLATED_DIR = os.path.join('./dataset/balance_sheet/deflated/', stock_code)
        self.CSV_DIR = os.path.join('./dataset/balance_sheet/csv/', stock_code)
        
    def __init_urls(self, stock_code, year, season):
        self.URLS = [
            self.URL_TEMPLATE % ('t05st32', stock_code, season, year - 1911),
            self.URL_TEMPLATE % ('t05st34', stock_code, season, year - 1911),
        ]



.\src\cash_flow_stmt\sourcing.py

import csv
import logging
import os
import shutil

from lxml import html

from ..common import logger
from ..common import sourcing_base

class Sourcing(sourcing_base.SourcingBase):

    def __init__(self):
        sourcing_base.SourcingBase.__init__(self)
        self.TEXT_DIR = ''
        self.ITEM_PREFIXES = {
            'Operating' : [
                '營業活動',
                '│營業活動'
            ],
            'Investing' : [
                '投資活動',
                '│投資活動'
            ],
            'Financing' : [
                '融資活動',
                '│融資活動',
                '理財活動',
                '不影響現金流量之融資活動'
            ],
        }
        self.SQL_INSERT = '''insert or ignore into
            CashFlowStmt(stock_code, report_type, report_date, activity_date, item, number)
            values(?, ?, ?, ?, ?, ?)
            '''

    def source_url_to_local(self, stock_code, year, season):
        self.__init_dirs(stock_code)
        self.__init_urls(stock_code, year, season)    
        sourcing_base.SourcingBase.source_url_to_local(self, self.LOCAL_DIR)
        
    def source_local_to_sqlite(self, stock_code):
        self.__init_dirs(stock_code)
        #local_file_dir = os.path.join(self.LOCAL_DIR, 'mops.twse.com.tw\mops\web')
        #sourcing_base.SourcingBase.source_local_to_deflated(self, local_file_dir, self.DEFLATED_DIR)
        #self.source_deflated_to_text(self.DEFLATED_DIR, self.TEXT_DIR)
        #self.source_text_to_csv(self.TEXT_DIR, self.CSV_DIR)
        sourcing_base.SourcingBase.source_csv_to_sqlite(self, self.CSV_DIR, self.DB_FILE, self.SQL_INSERT)    
       
    def source_deflated_to_text(self, src_dir, dest_dir):
        assert os.path.isdir(src_dir)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)

        for file in os.listdir(src_dir):
            file_name, file_ext = os.path.splitext(file)
            txt_file = os.path.join(dest_dir, file_name + '.txt')
            self.source_deflated_to_text_single(os.path.join(src_dir, file), txt_file)

    def source_deflated_to_text_single(self, src_file, dest_file):
        self.LOGGER.debug('''%s => %s''' % (src_file, dest_file))
        if os.path.getsize(src_file) == 0:
            shutil.copy(src_file, dest_file)
            return
        
        src_file_fd = open(src_file, 'rb')
        content = src_file_fd.read()
        src_file_fd.close()

        table = None
        try:
            table = html.fromstring(content.decode('utf-8'))
        except UnicodeDecodeError as e:
            self.LOGGER.debug(e)
            table = html.fromstring(content.decode('big5'))

        xpath_stmt = table.xpath('//body/table[@class="hasBorder"]/tr/td/pre/text()')
        if len(xpath_stmt) == 1:
            with open(dest_file, 'w', encoding='utf-8') as fd:
                fd.write(xpath_stmt[0].strip())
            return

        xpath_no_record = table.xpath('//body/center/h3/text()')
        if len(xpath_no_record) == 1:
            with open(dest_file, 'w', encoding='utf-8') as fd:
                fd.write(xpath_no_record[0].strip())
            return

    def source_text_to_csv(self, src_dir, dest_dir):
        assert os.path.isdir(src_dir)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)

        for file in os.listdir(src_dir):
            file_name, file_ext = os.path.splitext(file)
            csv_file = os.path.join(dest_dir, file_name + '.csv')
            self.source_text_to_csv_single(os.path.join(src_dir, file), csv_file)

    def source_text_to_csv_single(self, src_txt, dest_csv):
        self.LOGGER.debug('''%s => %s''' % (src_txt, dest_csv))

        fd = open(src_txt, 'rb')
        content = fd.read()
        fd.close()
        lines = content.decode('utf-8').split('\n')
        
        # No record
        if len(lines) == 1:
            msg = lines[0]
            if msg in self.WHITE_MSG:
                self.LOGGER.info('''%s => %s => No record''' % (src_txt, msg))
            else:
                self.LOGGER.error('''%s => %s''' % (src_txt, msg))
        # Has record
        else:
            items = self.__fetch_items(lines)
            rows = self.__build_records(src_txt, items)
            csv_writer = csv.writer(open(dest_csv, 'w', newline=''))
            csv_writer.writerows(rows)

    def __init_dirs(self, stock_code):
        self.LOCAL_DIR = os.path.join('./dataset/cash_flow_stmt/local/', stock_code)
        self.DEFLATED_DIR = os.path.join('./dataset/cash_flow_stmt/deflated/', stock_code)
        self.TEXT_DIR = os.path.join('./dataset/cash_flow_stmt/text/', stock_code)
        self.CSV_DIR = os.path.join('./dataset/cash_flow_stmt/csv/', stock_code)
        
    def __init_urls(self, stock_code, year, season):
        self.URLS = [
            self.URL_TEMPLATE % ('t05st36', stock_code, season, year - 1911),
            self.URL_TEMPLATE % ('t05st39', stock_code, season, year - 1911),
        ]

    def __fetch_items(self, lines):
        items = {
            'Operating' : [],
            'Investing' : [],
            'Financing' : [],
        }
        for line in lines:
            line_strip = line.strip()
            for key in items:
                for prefix in self.ITEM_PREFIXES[key]:
                    if line_strip.startswith(prefix):
                        items[key].append(line)
        for key in items:
            self.LOGGER.debug('''%s: %s''', key, items[key])
        return items

    def __build_records(self, src_txt, items):
        records = []
        for item in items:
            for line in items[item]:
                words = self.__split_words(line)
                if len(words) > 2:
                    number = self.__get_number(words[1])
                    last_number = self.__get_number(words[2])
                    record = [item, number, last_number]
                    records.append(record)
                    self.LOGGER.info('''record: %s''', record)
        return records        

    def __split_words(self, line):
        words = line.split()
        word_num = len(words)
        for i, word in enumerate(words):
            if (word == '(') or (word == '($'):
                next_i = i + 1
                if next_i < word_num:
                    words[next_i] = '(' + words[next_i]

        fixed_words = []
        for word in words:
            if (word != '') and (word != '(') and (word != '($') and (word != '$'): 
                fixed_words.append(word)
        return fixed_words

    def __get_number(self, number):
        number = number.strip()
        number = number.replace('$', '').replace(',', '')
        if (number[0] == '(') and (number[-1] == ')'):
            number = '-' + number[1:-1]
        return number
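
__split_words and __get_number together turn accounting-style tokens into plain signed numbers: '$' and ',' are stripped, and a parenthesized amount becomes negative. The same rule as a standalone sketch:

def to_number(token):
    # '(1,234)' => '-1234', '$5,678' => '5678'
    token = token.strip().replace('$', '').replace(',', '')
    if token.startswith('(') and token.endswith(')'):
        token = '-' + token[1:-1]
    return token

assert to_number('(1,234)') == '-1234'
assert to_number('$5,678') == '5678'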
        


.\src\income_stmt\sourcing.py

import os

from ..common import sourcing_base

class Sourcing(sourcing_base.SourcingBase):

    def __init__(self):
        sourcing_base.SourcingBase.__init__(self)
        self.SQL_INSERT = '''insert or ignore into
            IncomeStmt(stock_code, report_type, report_date, activity_date, item, number)
            values(?, ?, ?, ?, ?, ?)
            '''

    def source_url_to_local(self, stock_code, year, season):
        self.__init_dirs(stock_code)
        self.__init_urls(stock_code, year, season)    
        sourcing_base.SourcingBase.source_url_to_local(self, self.LOCAL_DIR)
        
    def source_local_to_sqlite(self, stock_code):
        self.__init_dirs(stock_code)
        #local_file_dir = os.path.join(self.LOCAL_DIR, 'mops.twse.com.tw\mops\web')
        #sourcing_base.SourcingBase.source_local_to_deflated(self, local_file_dir, self.DEFLATED_DIR)
        #sourcing_base.SourcingBase.source_deflated_to_csv(self, self.DEFLATED_DIR, self.CSV_DIR)
        sourcing_base.SourcingBase.source_csv_to_sqlite(self, self.CSV_DIR, self.DB_FILE, self.SQL_INSERT)    
        
    def __init_dirs(self, stock_code):
        self.LOCAL_DIR = os.path.join('./dataset/income_stmt/local/', stock_code)
        self.DEFLATED_DIR = os.path.join('./dataset/income_stmt/deflated/', stock_code)
        self.CSV_DIR = os.path.join('./dataset/income_stmt/csv/', stock_code)
        
    def __init_urls(self, stock_code, year, season):
        self.URLS = [
            self.URL_TEMPLATE % ('t05st32', stock_code, season, year - 1911),
            self.URL_TEMPLATE % ('t05st34', stock_code, season, year - 1911),
        ]



.\src\common\sourcing_base.py

import csv
import logging
import os
import re
import shutil
import sqlite3

from lxml import html

from ..common import logger

class SourcingBase():

    def __init__(self):
        self.LOGGER = logging.getLogger()
        self.URL_TEMPLATE = \
            '''http://mops.twse.com.tw/mops/web/ajax_%s?TYPEK=all&TYPEK2=&checkbtn=&co_id=%s&code1=&encodeURIComponent=1&firstin=1&isnew=false&keyword4=&off=1&queryName=co_id&season=%02d&step=1&year=%d'''
        self.URLS = []
        self.LOCAL_DIR = './'
        self.DEFLATED_DIR = './'
        self.CSV_DIR = './'
        self.DB_FILE = './db/stocktotal.db'
        self.SQL_INSERT = ''
        self.WHITE_MSG = [
            '資料庫中查無需求資料 !',
            '資料庫中查無需求資料',
            '無應編製合併財報之子公司',
            '外國發行人免申報個別財務報表資訊,請至合併財務報表查詢',
        ]
        self.SEASON_STR_MAP = {
            '01' : '-03-31',
            '02' : '-06-30',
            '03' : '-09-30',
            '04' : '-12-31'
        }
        self.REPORT_TYPE_MAP = {
            't05st32' : 'I', # Individual Income Statement
            't05st34' : 'C', # Consolidated Income Statement
            't05st36' : 'I', # Individual Cash Flow Statement
            't05st39' : 'C', # Consolidated Cash Flow Statement
            't05st31' : 'I', # Individual Balance Sheet
            't05st33' : 'C', # Consolidated Balance Sheet
        }

    def source_url_to_local(self, dest_dir):
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        for url in self.URLS:
            self.__wget(url, dest_dir)
            
    def source_local_to_deflated(self, src_dir, dest_dir):
        assert os.path.isdir(src_dir)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)

        for file in os.listdir(src_dir):
            prog_name = file[5:12]
            args = self.__parse_args(file)
            html_file = '''%s_%s_%s_%s.html''' % \
                    (prog_name, args['co_id'], args['year'], args['season'])
            shutil.copy(os.path.join(src_dir, file), os.path.join(dest_dir, html_file))  

    def source_deflated_to_csv(self, src_dir, dest_dir):
        assert os.path.isdir(src_dir)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)

        for file in os.listdir(src_dir):
            file_name, file_ext = os.path.splitext(file)
            dest_file = os.path.join(dest_dir, file_name + '.csv')
            self.source_deflated_to_csv_single(os.path.join(src_dir, file), dest_file)

    def source_deflated_to_csv_single(self, src_file, dest_file):
        self.LOGGER.debug('''%s => %s''' % (src_file, dest_file))
        src_file_fd = open(src_file, 'rb')
        content = src_file_fd.read()
        src_file_fd.close()

        # wget timeout => 0 filesize web content => should be source again.
        if content == b'':
            self.LOGGER.error('''%s => 0 filesize''' % src_file)
            return
        
        table = None
        try:
            table = html.fromstring(content.decode('utf-8').replace('&nbsp;', ' '))
        except UnicodeDecodeError as e:
            self.LOGGER.debug(e)
            table = html.fromstring(content.decode('big5').replace('&nbsp;', ' '))
        except Exception as e:
            self.LOGGER.error(e)
            return

        xpath_no_record = table.xpath('//body/center/h3/text()')
        if len(xpath_no_record) == 1:
            with open(dest_file, 'w') as fd:
                fd.write(xpath_no_record[0].strip())
            return
        
        csv_writer = csv.writer(open(dest_file, 'w', newline=''))
        for tr in table.xpath('//tr'):
            tds = tr.xpath('./td/text()')
            if len(tds) == 5:
                csv_record = [tds[0].strip(), tds[1].strip(), tds[3].strip()]
                csv_writer.writerow(csv_record)            

    def source_csv_to_sqlite(self, src_dir, dest_db, sql_insert):
        assert os.path.isdir(src_dir)
        for file in os.listdir(src_dir):
            self.source_csv_to_sqlite_single(os.path.join(src_dir, file), dest_db, sql_insert)
            
    def source_csv_to_sqlite_single(self, src_file, dest_db, sql_insert):
        self.LOGGER.debug('''%s => %s''' % (src_file, dest_db))
        assert os.path.isfile(src_file)
        assert os.path.isfile(dest_db)
        
        file_name, file_ext = os.path.splitext(os.path.basename(src_file))
        report_code, stock_code, year, season = file_name.split('_')
        report_type = self.REPORT_TYPE_MAP[report_code]
        date = self.__get_date(year, season)
        
        conn = sqlite3.connect(dest_db)
        cursor = conn.cursor()        
        csv_reader = csv.reader(open(src_file, 'r'))
        for row in csv_reader:
            if len(row) == 1:
                msg = row[0]
                if msg in self.WHITE_MSG:
                    self.LOGGER.info('''%s => %s => No record''' % (src_file, msg))
                else:
                    self.LOGGER.error('''%s => %s''' % (src_file, msg))
            elif len(row) in (2, 3):
                cursor.execute(sql_insert, \
                        (stock_code, report_type, date, date, row[0], row[1]))
            if len(row) == 3:
                last_date = self.__get_last_date(year, season)
                cursor.execute(sql_insert, \
                        (stock_code, report_type, date, last_date, row[0], row[2]))
        conn.commit()
        cursor.close()
        conn.close()
                
    def __get_date(self, year, season):
        return str(int(year) + 1911) + self.SEASON_STR_MAP[season]

    def __get_last_date(self, year, season):
        return str(int(year) + 1910) + self.SEASON_STR_MAP[season]        
        
    def __wget(self, url, dest_dir):
        url_to_filepath = re.compile('https?://|ftp://').sub('', url).replace(':', '_')
        dest_file = os.path.join(dest_dir, url_to_filepath)
        dest_file_dir = os.path.dirname(dest_file)
        if not os.path.exists(dest_file_dir):
            os.makedirs(dest_file_dir)

        wget = os.path.abspath('./src/thirdparty/wget/wget.exe')
        assert os.path.isfile(wget)  
        wget_cmdline = '''%s -N \"%s\" --waitretry=3 -P %s''' % (wget, url, dest_file_dir)
        os.system(wget_cmdline)
        
    def __parse_args(self, args_line):
        args = {}
        for kvs in args_line.split('&'):
            kv = kvs.split('=')
            args[kv[0]] = kv[1]
        return args
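
source_csv_to_sqlite_single recovers everything it needs from the deflated filename alone; with a hypothetical filename, the split and the ROC-year mapping look like this:

# Hypothetical deflated filename: report-code_stock-code_ROC-year_season.
file_name = 't05st34_1101_99_04'
report_code, stock_code, year, season = file_name.split('_')
season_str_map = {'01': '-03-31', '02': '-06-30', '03': '-09-30', '04': '-12-31'}
report_date = str(int(year) + 1911) + season_str_map[season]
print(report_code, stock_code, report_date)   # t05st34 1101 2010-12-31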



.\src\common\logger.py

import logging
import sys

FORMAT = "%(asctime)s %(filename)s [%(levelname)s] %(message)s"
DATEFMT = "%H:%M:%S"

def config_root(level=logging.INFO,
                threshold=logging.WARNING,
                format=FORMAT,
                datefmt=DATEFMT):
    root = logging.getLogger()
    root.setLevel(level)
    formatter = logging.Formatter(format, datefmt)

    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setLevel(level)
    stdout_handler.setFormatter(logging.Formatter(format, datefmt))
    root.addHandler(stdout_handler)

    #stderr_handler = logging.StreamHandler(sys.stderr)
    #stderr_handler.setLevel(logging.ERROR)
    #stderr_handler.setFormatter(logging.Formatter(format, datefmt))
    #root.addHandler(stderr_handler)
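
Typical use from a driver script at the project root (see the sourcing_* scripts above):

import logging
import src.common.logger as logger

logger.config_root(level=logging.DEBUG)
logging.getLogger().info('sourcing started')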



Mission accomplished.



《King Lear》
I cannot heave my heart into my mouth.
Cordelia is doomed to tragedy.

I yet beseech your majesty, --
If for I want that glib and oily art,
To speak and purpose not; since what I well intend,
I'll do't before I speak, -- that you make known
It is no vicious blot, murder, or foulness,
No unchaste action, or dishonour'd step,
That hath deprived me of your grace and favour;
But even for want of that for which I am richer,
A still-soliciting eye, and such a tongue
As I am glad I have not, though not to have it
Hath lost me in your liking.
Everyone loves flattery, reigning monarchs and presidents most of all; they crave approving looks and honeyed words. The mouth says talent and character both matter when choosing people; in practice it just picks lapdogs.

Better thou
Hadst not been born than not to have pleased me better.
All you who won't blow my horn, go to hell. Shakespeare was a prophet: he wrote the weak, incompetent, flattery-craving president down plainly, far in advance, for later verification. See? The play runs exactly this way every time.

Time shall unfold what plaited cunning hides:
Who cover faults, at last shame them derides.
If love means holding the other person more important than yourself, then let me give it a try.


Thursday, October 11, 2012

A line from The Wealth of Nations

One. The secret of wine.
"Though such vineyards are in general more carefully cultivated than most others, the high price of the wine seems to be, not so much the effect, as the cause of this careful cultivation."

People mistake effect for cause and let fixed impressions decide things, for instance the assumption that government agencies never do wrong. Today I caught two news items, both of them garbage.

(1) The environmental bureau said CPC at first refused to confirm it had discharged oil, but inspectors found obvious oil in the effluent tank of the plant's wastewater facility, had the plant turn on forced drainage, confirmed the sewage flowed into the outside ditch and that the pollution was CPC's doing, and only then issued the maximum NT$600,000 fine.
(2) The job paid NT$2,000 a day with room and board, but after she was hired the Ministry of Foreign Affairs, because she was a student, would only pay NT$1,000 a day. She protested: "An accompanying interpreter normally gets about NT$8,000 a day. Even the greenest beginner, offered one thousand to host, interpret, and follow the whole itinerary... how can the government lead the way in exploiting labor and disrespecting professionals?" She refused the job in anger.

In the end it is all about money. For money, people will be exactly as despicable as they can get away with: pretty words up front, utter rubbish behind. Strictly speaking this isn't even mistaking effect for cause; one should never have assumed "the government works for the people, therefore it does good." The sound assumption is "the government is made of people, therefore it behaves the way people do."




Two. Car repairs cost 8,500. With the car safe again, I'm thinking of sneaking a ride up into the Hehuanshan area to bag the one I still owe, Hehuan East Peak.




Three. Necessities of life?
"The crux is that housing is a necessity of life, unlike stocks and other financial assets. Once high housing prices create wealth inequality and a huge gap between social classes, in the long run they bring severe social contradiction and confrontation."

Says who? Is there a story of people handing over money with both hands, volunteering to be the sucker? The left hand adds an asset while the right hand adds long-term debt; professionals know that an equity ratio above fifty percent is what marks a good company. Something whose price is this far out of line with its value, do they take me for an idiot? I'd rather let three million sit idle than be the sucker. Ghost island Taiwan, as if I'd be scared of you.

And what's everyone else's equity ratio, then? Mine is one hundred percent, which makes me the person banks hate most. MOMO is the real necessity of life.



Four. Scary: half an hour on the phone and my ears are ringing. Thanks, Dad, for reminding me once more about turning thirty. In the mountains there is no calendar; time only means clock hands, phone alarms, and Outlook meeting reminders. Take those away, and what time it is, how old I am, by what age one ought to marry, none of it means anything at all.

Yet everyone lives under the spell of numbers, round numbers above all: the big three-oh, the 7,000-point mark, the 8,000-point mark, the ROC centennial. Such captivating round numbers. I walked Dad through what happiness actually means; if girls simply stuck to me whenever I tossed out a line on the street, that really would be absurd.

It's just not that simple. I'm not both the wisher and the genie acting out my own play. The whole thing is wearying: Dad nags me, and I nag myself.




五。Thank you for visiting census.gov, we are currently performing system maintenance. We apologize for any inconvenience you may experience and hope you will come back after we have completed our maintenance.

Suddenly this feels pretty lame. Better to go review http://www.wretch.cc/blog/JaguarCSIA/16183273 first.

Wednesday, October 10, 2012

The Caoling Historic Trail, from Tamsui Subprefecture to Kavalan Subprefecture


Date: 2012-10-10



Took the train with MOMO ((not her real name)) to Dali Station. Today is Chiang Kuo-ching's birthday, a fact the government refuses to face while fancying itself the "upholder of justice", a fact it has no business rehashing. This is a judicial miscarriage from over a decade ago.

On September 12, 1996, a five-year-old girl surnamed Hsieh was raped and murdered inside the Air Force Combatant Command compound in Da'an District, Taipei. The military's task force rushed the investigation and trial, and Chiang Kuo-ching, held to be the culprit, was executed by firing squad on August 13, 1997, at only 21 years of age.

A nation's birthday is not necessarily a happy occasion.

Today was quite happy, though. Every trip to the Caoling Historic Trail feels different: fresh, familiar, relaxed, tense, comfortable, constrained. Which was it this time? I'll leave that out.



From Dali Station we started with a short climb, then a gentle descent, facing the saddle, the 埡口 ((MOMO corrected my 啞口)). Our line of travel bore right; downhill, but definitely the correct direction, so no worries there. As for the foot, it was bearable. They say some people do high-mountain traverses while injecting insulin every day, so this little ache should be nothing. MOMO's hand is still healing, though, and she can't carry much, which is not ideal either.

First break.
The wind here isn't too strong.

In the distance sits Guishan Island, which we visited just recently.



A short walk after the break and, dang, there was the saddle ((ibid.)). The wind was wild. Standing on the lookout we gazed out at Guishan Island; when we looked back at Taiwan from the 401 Highland there, we surely could have seen the Caoling Trail. The wind streamed fast through the silvergrass on the slope, and seen from the other side, the sun painted the grass a bright, shining white. We reached an important conclusion: starting from Dali in the morning gets the colors wrong. The color-correct route is to start at Gongliao Station and slowly pound the road from there.

((Photo by MOMO))



After that it was road-pounding all the way, too much of it, and the feet complained. The pomelo tree is still there, but the pomelos are past ripe.



Slept hard at home. It's been a long time since I was this tired.

Friday, October 5, 2012

Scraping Jin Ping Mei


Total domination!!! ((the previous title was wrong))



I still can't help complaining that the workload keeps growing, which sours the mood. The changes I really want to make haven't finished testing. No matter: a day has twenty-four hours, that's all the time there is, and the rest can wait.



import logging
import os

from lxml import html

from ..common import logger

class Sourcing():

    def __init__(self):
        self.__logger = logging.getLogger()
        self.URLS = []
        self.LOCAL_DIR = './dataset/jin_ping_mei/local/'
        self.TXT_DIR = './dataset/jin_ping_mei/txt/'
        self.TXT_AMALGAMATION = './dataset/jin_ping_mei/txt/AMALGAMATION.txt'

    def source(self):
        self.__init_urls()
        self.source_url_to_local(self.LOCAL_DIR)
        self.source_local_to_txt_batch(self.LOCAL_DIR, self.TXT_DIR)
        self.source_txt_to_amalgamation(self.TXT_DIR, self.TXT_AMALGAMATION)
     
    def source_url_to_local(self, dest_dir):
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        for url in self.URLS:
            self.__wget(url, dest_dir)

    def source_local_to_txt_batch(self, src_dir, dest_dir):
        assert os.path.isdir(src_dir)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        for file in os.listdir(src_dir):
            file_name, file_ext = os.path.splitext(file)
            dest_file = os.path.join(dest_dir, file_name + ".txt")
            self.source_local_to_txt(os.path.join(src_dir, file), dest_file)

    def source_local_to_txt(self, src_file, dest_file):
        src_file_fd = open(src_file, encoding='utf-8')
        content = src_file_fd.read()
        src_file_fd.close()
     
        dest_file_fd = open(dest_file, 'w', encoding='utf-8')
        html_content = html.fromstring(content.replace('<br>', ''))
        for header in html_content.xpath('//html/body/div/div/div[@class="articleHeader"]/h1/text()'):
            dest_file_fd.write(header + '\n\n')
            break
        for content in html_content.xpath('//html/body/div/div/div[@id="content"]/div[@id="mw-content-text"]'):
            for row in content.xpath('./p/text()'):
                dest_file_fd.write(row + '\n\n')
        dest_file_fd.close()

    def source_txt_to_amalgamation(self, src_dir, dest_file):
        dest_file_fd = open(dest_file, 'w', encoding='utf-8')
        for i in range(1, 101):
            src_file = os.path.join(src_dir, '''%02d.txt''' % i)
            assert os.path.isfile(src_file)
            dest_file_fd.write(open(src_file, encoding='utf-8').read())
            dest_file_fd.write('\n\n')
        dest_file_fd.close()
         
    def __init_urls(self):
        URL_TEMPLATE = '''http://zh.wikisource.7val.com/wiki/%%E9%%87%%91%%E7%%93%%B6%%E6%%A2%%85/%%E7%%AC%%AC%02d%%E5%%9B%%9E'''
        self.URLS = [URL_TEMPLATE % _ for _ in range(1, 101)]      
         
    def __wget(self, url, dest_dir):
        dest_file = os.path.join(dest_dir, '''%s.html''' % url[72: url.index('%', 72)])
        wget = os.path.abspath('./src/thirdparty/wget/wget.exe')
        assert os.path.isfile(wget)          
        wget_cmdline = '''%s -U firefox -N \"%s\" --waitretry=3 -O %s''' % (wget, url, dest_file)
        os.system(wget_cmdline)          
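
A usage sketch, assuming the module is importable as src.jin_ping_mei.sourcing (matching the dataset paths above):

import src.jin_ping_mei.sourcing as sourcing

s = sourcing.Sourcing()
s.source()   # 100 chapters => ./dataset/jin_ping_mei/txt/AMALGAMATION.txt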



I should stop complaining. Friday night I shared my web-scraping notes with colleagues and the lost motivation came right back! Behold the freshly revised code:


sourcing_base.py ((the shared parts, extracted))

import csv
import logging
import os
import re
import shutil
import sqlite3

from lxml import html

from ..common import logger

class SourcingBase():

    def __init__(self):
        self.LOGGER = logging.getLogger()
        self.URL_TEMPLATE = \
            '''http://mops.twse.com.tw/mops/web/ajax_%s?TYPEK=all&TYPEK2=&checkbtn=&co_id=%s&code1=&encodeURIComponent=1&firstin=1&isnew=false&keyword4=&off=1&queryName=co_id&season=%02d&step=1&year=%d'''
        self.URLS = []
        self.LOCAL_DIR = './'
        self.DEFLATED_DIR = './'
        self.CSV_DIR = './'
        self.DB_FILE = './db/stocktotal.db'
        self.SQL_INSERT = ''
        self.WHITE_MSG = [
            '資料庫中查無需求資料',
            '無應編製合併財報之子公司',
            '外國發行人免申報個別財務報表資訊,請至合併財務報表查詢',
        ]
        self.SEASON_STR_MAP = {
            '01' : '-03-31',
            '02' : '-06-30',
            '03' : '-09-30',
            '04' : '-12-31'
        }
        self.REPORT_TYPE_MAP = {
            't05st32' : 'I', # Individual Income Statement
            't05st34' : 'C', # Consolidated Income Statement
            't05st36' : 'I', # Individual Cash Flow Statement
            't05st39' : 'C', # Consolidated Cash Flow Statement
            't05st31' : 'I', # Individual Balance Sheet
            't05st33' : 'C', # Consolidated Balance Sheet
        }

    def source_url_to_local(self, dest_dir):
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        for url in self.URLS:
            self.__wget(url, dest_dir)
         
    def source_local_to_deflated(self, src_dir, dest_dir):
        assert os.path.isdir(src_dir)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)

        for file in os.listdir(src_dir):
            prog_name = file[5:12]
            args = self.__parse_args(file)
            html_file = '''%s_%s_%s_%s.html''' % \
                    (prog_name, args['co_id'], args['year'], args['season'])
            shutil.copy(os.path.join(src_dir, file), os.path.join(dest_dir, html_file))

    def source_deflated_to_csv(self, src_dir, dest_dir):
        assert os.path.isdir(src_dir)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)

        for file in os.listdir(src_dir):
            file_name, file_ext = os.path.splitext(file)
            dest_file = os.path.join(dest_dir, file_name + '.csv')
            self.source_deflated_to_csv_single(os.path.join(src_dir, file), dest_file)

    def source_deflated_to_csv_single(self, src_file, dest_file):
        src_file_fd = open(src_file, 'rb')
        content = src_file_fd.read()
        src_file_fd.close()
   
        table = html.fromstring(content.decode('utf-8').replace('&nbsp;', ' '))
        xpath_no_record = table.xpath('//body/center/h3/text()')
        if len(xpath_no_record) == 1:
            with open(dest_file, 'w') as fd:
                fd.write(xpath_no_record[0].strip())
            return

        xpath_busy = table.xpath('//html/body/center/table/tr/td/font/center/text()')
        if len(xpath_busy) == 1:
            with open(dest_file, 'w') as fd:
                fd.write(xpath_busy[0].strip())
            return
         
        csv_writer = csv.writer(open(dest_file, 'w', newline=''))
        for tr in table.xpath('//tr'):
            tds = tr.xpath('./td/text()')
            if len(tds) == 5:
                csv_record = [tds[0].strip(), tds[1].strip(), tds[3].strip()]
                csv_writer.writerow(csv_record)          

    def source_csv_to_sqlite(self, src_dir, dest_db, sql_insert):
        assert os.path.isdir(src_dir)
        for file in os.listdir(src_dir):
            self.source_csv_to_sqlite_single(os.path.join(src_dir, file), dest_db, sql_insert)
         
    def source_csv_to_sqlite_single(self, src_file, dest_db, sql_insert):
        self.LOGGER.debug('''%s => %s''' % (src_file, dest_db))
        assert os.path.isfile(src_file)
        assert os.path.isfile(dest_db)
     
        file_name, file_ext = os.path.splitext(os.path.basename(src_file))
        report_code, stock_code, year, season = file_name.split('_')
        report_type = self.REPORT_TYPE_MAP[report_code]
        date = self.__get_date(year, season)
     
        conn = sqlite3.connect(dest_db)
        cursor = conn.cursor()      
        csv_reader = csv.reader(open(src_file, 'r'))
        for row in csv_reader:
            if len(row) == 1:
                msg = row[0]
                if msg in self.WHITE_MSG:
                    self.LOGGER.info('''%s => %s => No record''' % (src_file, msg))
                else:
                    self.LOGGER.error('''%s => %s''' % (src_file, msg))
            elif len(row) in (2, 3):
                cursor.execute(sql_insert, \
                        (stock_code, report_type, date, date, row[0], row[1]))
            if len(row) == 3:
                last_date = self.__get_last_date(year, season)
                cursor.execute(sql_insert, \
                        (stock_code, report_type, date, last_date, row[0], row[2]))
        conn.commit()
        cursor.close()
        conn.close()
             
    def __get_date(self, year, season):
        return str(int(year) + 1911) + self.SEASON_STR_MAP[season]

    def __get_last_date(self, year, season):
        return str(int(year) + 1910) + self.SEASON_STR_MAP[season]      
     
    def __wget(self, url, dest_dir):
        url_to_filepath = re.compile('https?://|ftp://').sub('', url).replace(':', '_')
        dest_file = os.path.join(dest_dir, url_to_filepath)
        dest_file_dir = os.path.dirname(dest_file)
        if not os.path.exists(dest_file_dir):
            os.makedirs(dest_file_dir)

        wget = os.path.abspath('./src/thirdparty/wget/wget.exe')
        assert os.path.isfile(wget)
        wget_cmdline = '''%s -N \"%s\" --waitretry=3 -P %s''' % (wget, url, dest_file_dir)
        os.system(wget_cmdline)
     
    def __parse_args(self, args_line):
        args = {}
        for kvs in args_line.split('&'):
            kv = kvs.split('=')
            args[kv[0]] = kv[1]
        return args



sourcing.py ((scrapes the cash flow statement))

import csv
import logging
import os
import shutil

from lxml import html

from ..common import logger
from ..common import sourcing_base

class Sourcing(sourcing_base.SourcingBase):

    def __init__(self):
        sourcing_base.SourcingBase.__init__(self)
        self.TEXT_DIR = ''
        self.ITEM_PREFIXES = {
            'Operating' : [
                '營業活動',
                '│營業活動'
            ],
            'Investing' : [
                '投資活動',
                '│投資活動'
            ],
            'Financing' : [
                '融資活動',
                '│融資活動',
                '理財活動',
                '不影響現金流量之融資活動'
            ],
        }
        self.SQL_INSERT = '''insert or ignore into
            CashFlowStmt(stock_code, report_type, report_date, activity_date, item, number)
            values(?, ?, ?, ?, ?, ?)
            '''

    def source_url_to_local(self, stock_code, year, season):
        self.__init_dirs(stock_code)
        self.__init_urls(stock_code, year, season)  
        sourcing_base.SourcingBase.source_url_to_local(self, self.LOCAL_DIR)
     
    def source_local_to_sqlite(self, stock_code):
        self.__init_dirs(stock_code)
        local_file_dir = os.path.join(self.LOCAL_DIR, r'mops.twse.com.tw\mops\web')
        sourcing_base.SourcingBase.source_local_to_deflated(self, local_file_dir, self.DEFLATED_DIR)
        self.source_deflated_to_text(self.DEFLATED_DIR, self.TEXT_DIR)
        self.source_text_to_csv(self.TEXT_DIR, self.CSV_DIR)
        sourcing_base.SourcingBase.source_csv_to_sqlite(self, self.CSV_DIR, self.DB_FILE, self.SQL_INSERT)  
     
    def source_deflated_to_text(self, src_dir, dest_dir):
        assert os.path.isdir(src_dir)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)

        for file in os.listdir(src_dir):
            file_name, file_ext = os.path.splitext(file)
            txt_file = os.path.join(dest_dir, file_name + '.txt')
            self.source_deflated_to_text_single(os.path.join(src_dir, file), txt_file)

    def source_deflated_to_text_single(self, src_file, dest_txt):
        if os.path.getsize(src_file) == 0:
            shutil.copy(src_file, dest_txt)
            return
     
        src_file_fd = open(src_file, 'rb')
        content = src_file_fd.read()
        src_file_fd.close()

        table = html.fromstring(content.decode('utf-8'))
        xpath_stmt = table.xpath('//body/table[@class="hasBorder"]/tr/td/pre/text()')
        if len(xpath_stmt) == 1:
            with open(dest_txt, 'w', encoding='utf-8') as fd:
                fd.write(xpath_stmt[0].strip())
            return

        xpath_no_record = table.xpath('//body/center/h3/text()')
        if len(xpath_no_record) == 1:
            with open(dest_txt, 'w', encoding='utf-8') as fd:
                fd.write(xpath_no_record[0].strip())
            return

        xpath_busy = table.xpath('//html/body/center/table/tr/td/font/center/text()')
        if len(xpath_busy) == 1:
            with open(dest_txt, 'w', encoding='utf-8') as fd:
                fd.write(xpath_busy[0].strip())
            return

    def source_text_to_csv(self, src_dir, dest_dir):
        assert os.path.isdir(src_dir)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)

        for file in os.listdir(src_dir):
            file_name, file_ext = os.path.splitext(file)
            csv_file = os.path.join(dest_dir, file_name + '.csv')
            self.source_text_to_csv_single(os.path.join(src_dir, file), csv_file)

    def source_text_to_csv_single(self, src_txt, dest_csv):
        self.LOGGER.debug('''%s => %s''' % (src_txt, dest_csv))

        fd = open(src_txt, 'rb')
        content = fd.read()
        fd.close()
        lines = content.decode('utf-8').split('\n')
     
        # No record
        if len(lines) == 1:
            msg = lines[0]
            if msg in self.WHITE_MSG:
                self.LOGGER.info('''%s => %s => No record''' % (src_txt, msg))
            else:
                self.LOGGER.error('''%s => %s''' % (src_txt, msg))
        # Has record
        else:
            items = self.__fetch_items(lines)
            rows = self.__build_records(src_txt, items)
            csv_writer = csv.writer(open(dest_csv, 'w', newline=''))
            csv_writer.writerows(rows)

    def __init_dirs(self, stock_code):
        self.LOCAL_DIR = os.path.join('./dataset/cash_flow_stmt/local/', stock_code)
        self.DEFLATED_DIR = os.path.join('./dataset/cash_flow_stmt/deflated/', stock_code)
        self.TEXT_DIR = os.path.join('./dataset/cash_flow_stmt/text/', stock_code)
        self.CSV_DIR = os.path.join('./dataset/cash_flow_stmt/csv/', stock_code)
     
    def __init_urls(self, stock_code, year, season):
        self.URLS = [
            self.URL_TEMPLATE % ('t05st36', stock_code, season, year - 1911),
            self.URL_TEMPLATE % ('t05st39', stock_code, season, year - 1911),
        ]

    def __fetch_items(self, lines):
        items = {
            'Operating' : [],
            'Investing' : [],
            'Financing' : [],
        }
        for line in lines:
            line_strip = line.strip()
            for key in items:
                for prefix in self.ITEM_PREFIXES[key]:
                    if line_strip.startswith(prefix):
                        items[key].append(line)
        for key in items:
            self.LOGGER.debug('''%s: %s''', key, items[key])
        return items

    def __build_records(self, src_txt, items):
        records = []
        for item in items:
            for line in items[item]:
                words = self.__split_words(line)
                if len(words) > 2:
                    number = self.__get_number(words[1])
                    last_number = self.__get_number(words[2])
                    record = [item, number, last_number]
                    records.append(record)
                    self.LOGGER.info('''record: %s''', record)
        return records      

    def __split_words(self, line):
        words = line.split()
        word_num = len(words)
        for i, word in enumerate(words):
            if (word == '(') or (word == '($'):
                next_i = i + 1
                if next_i < word_num:
                    words[next_i] = '(' + words[next_i]

        fixed_words = []
        for word in words:
            if (word != '') and (word != '(') and (word != '($') and (word != '$'):
                fixed_words.append(word)
        return fixed_words

    def __get_number(self, number):
        number = number.strip()
        number = number.replace('$', '').replace(',', '')
        if (number[0] == '(') and (number[-1] == ')'):
            number = '-' + number[1:-1]
        return number



sourcing.py ((scrapes the balance sheet; to be renamed later: statement of financial position))

import os

from ..common import sourcing_base

class Sourcing(sourcing_base.SourcingBase):

    def __init__(self):
        sourcing_base.SourcingBase.__init__(self)
        self.SQL_INSERT = '''insert or ignore into
            BalanceSheet(stock_code, report_type, report_date, activity_date, item, number)
            values(?, ?, ?, ?, ?, ?)
            '''

    def source_url_to_local(self, stock_code, year, season):
        self.__init_dirs(stock_code)
        self.__init_urls(stock_code, year, season)  
        sourcing_base.SourcingBase.source_url_to_local(self, self.LOCAL_DIR)
     
    def source_local_to_sqlite(self, stock_code):
        self.__init_dirs(stock_code)
        local_file_dir = os.path.join(self.LOCAL_DIR, r'mops.twse.com.tw\mops\web')
        sourcing_base.SourcingBase.source_local_to_deflated(self, local_file_dir, self.DEFLATED_DIR)
        sourcing_base.SourcingBase.source_deflated_to_csv(self, self.DEFLATED_DIR, self.CSV_DIR)
        sourcing_base.SourcingBase.source_csv_to_sqlite(self, self.CSV_DIR, self.DB_FILE, self.SQL_INSERT)  
     
    def __init_dirs(self, stock_code):
        self.LOCAL_DIR = os.path.join('./dataset/balance_sheet/local/', stock_code)
        self.DEFLATED_DIR = os.path.join('./dataset/balance_sheet/deflated/', stock_code)
        self.CSV_DIR = os.path.join('./dataset/balance_sheet/csv/', stock_code)
     
    def __init_urls(self, stock_code, year, season):
        self.URLS = [
            self.URL_TEMPLATE % ('t05st32', stock_code, season, year - 1911),
            self.URL_TEMPLATE % ('t05st34', stock_code, season, year - 1911),
        ]



sourcing.py ((scrapes the income statement))

import os

from ..common import sourcing_base

class Sourcing(sourcing_base.SourcingBase):

    def __init__(self):
        sourcing_base.SourcingBase.__init__(self)
        self.SQL_INSERT = '''insert or ignore into
            IncomeStmt(stock_code, report_type, report_date, activity_date, item, number)
            values(?, ?, ?, ?, ?, ?)
            '''

    def source_url_to_local(self, stock_code, year, season):
        self.__init_dirs(stock_code)
        self.__init_urls(stock_code, year, season)    
        sourcing_base.SourcingBase.source_url_to_local(self, self.LOCAL_DIR)
        
    def source_local_to_sqlite(self, stock_code):
        self.__init_dirs(stock_code)
        local_file_dir = os.path.join(self.LOCAL_DIR, r'mops.twse.com.tw\mops\web')
        sourcing_base.SourcingBase.source_local_to_deflated(self, local_file_dir, self.DEFLATED_DIR)
        sourcing_base.SourcingBase.source_deflated_to_csv(self, self.DEFLATED_DIR, self.CSV_DIR)
        sourcing_base.SourcingBase.source_csv_to_sqlite(self, self.CSV_DIR, self.DB_FILE, self.SQL_INSERT)    
        
    def __init_dirs(self, stock_code):
        self.LOCAL_DIR = os.path.join('./dataset/income_stmt/local/', stock_code)
        self.DEFLATED_DIR = os.path.join('./dataset/income_stmt/deflated/', stock_code)
        self.CSV_DIR = os.path.join('./dataset/income_stmt/csv/', stock_code)
        
    def __init_urls(self, stock_code, year, season):
        self.URLS = [
            self.URL_TEMPLATE % ('t05st32', stock_code, season, year - 1911),
            self.URL_TEMPLATE % ('t05st34', stock_code, season, year - 1911),
        ]


Thursday, October 4, 2012

Feeling a bit too numb


Got paid a few days ago; the number in the account grew, but I couldn't say by how much. When money becomes digits, it's hard to feel what those digits can do. Or rather, right now there's nothing to do with them anyway. Sala says it's an MBO, and A-ban made a point of repeating what MBO stands for; in any case, there's a reason the number grew. Whatever the job, what counts is the account balance net of liabilities actually growing. How much counts as much depends on the person: few desires, small numbers, and then it hardly matters.




Now for something serious, something that bears on the growth momentum of that account number. Does this count as serious?

The monthly bulletin of securities market statistics: total market capitalization, return on investment, P/E ratio, and dividend yield. The market P/E (PER) from January through August this year: 16.74, 18.11, 20.68, 22.72, 21.67, 21.68, 21.74, 24.47. A PER of 24.47 means that if you invest ten thousand, can never redeem the principal, and only collect the company's earnings (i.e. hold long term), it takes 24.47 years to break even. Put differently: I turn thirty this year, so I would have to wait until I am fifty-four years, five months, and twenty days old to get my money back. As a rule of thumb, a PER around 10 marks a macro environment that is actually worth buying into.
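
The break-even age is plain arithmetic, current age plus PER; a quick check of the figure above:

per = 24.47                      # TWSE market P/E, August 2012
age = 30
payback_age = age + per          # 54.47 years old
months = (payback_age % 1) * 12  # 5.64 months
days = (months % 1) * 30         # ~19 days, close to the "five months and twenty days" above
print(int(payback_age), int(months), round(days))   # 54 5 19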

Brother Kuo once said: "The global macro economy is in a recovery phase and stock indexes will be in a bull cycle, but if you cannot find stocks that are cheap, good, and right for you, then keep watching from the sidelines or trim your positions!"

Taiwan's macro environment versus the world's: compare them for yourself. But none of that is the point. The point is making money, so you have to know how to wait.




Speaking of waiting: the pes anserinus has put up with three months of this and, damn it, still hurts. The mountain plans go ahead anyway: first the Dabajian peaks, then the Holy Ridge, and the Nanhu peaks with the professor over New Year. Next year, walking in Spain ((nothing to do with mountaineering at all)).