2017年9月2日 星期六

Python HTMLParser

# -*- coding: utf-8 -*-

from html.parser import HTMLParser
import logging
import urllib3
import re

class MyHTMLParser(HTMLParser):
    """Group HTML text nodes into fixed-length rows and print each row.

    A row starts at the first text node that, once spaces and commas are
    removed, is exactly four digits: only 4-digit stock codes are wanted;
    warrants and similar instruments are skipped. Subsequent text nodes are
    appended until the row holds ``ROW_LENGTH`` fields. The next text node
    seen after that triggers printing of the row and resets the collector;
    that triggering node itself is discarded.

    Attributes:
        test: Fields collected so far for the row in progress.
        startFlag: True while a row is being collected.
    """

    # Number of fields that make up one printed row.
    ROW_LENGTH = 10
    # Matches a cleaned field that is a 4-digit stock code.
    _CODE_RE = re.compile(r'^\d{4}$')

    def __init__(self, *args, **kwargs):
        """Create a parser with its own, initially empty, row buffer.

        All arguments are forwarded unchanged to ``HTMLParser.__init__``.
        """
        super().__init__(*args, **kwargs)
        # Per-instance state: a class-level list would be shared by every
        # parser instance and leak fields from one parse into another.
        self.test = []
        self.startFlag = False

    def handle_data(self, data):
        """Process one text node delivered by the HTML parser.

        Args:
            data: Raw text found between tags.
        """
        # Drop padding spaces and thousands separators, e.g. ' 5,106,247'.
        field = data.replace(' ', '').replace(',', '')

        if not self.startFlag and self._CODE_RE.match(field) is not None:
            # A 4-digit code outside a row opens a new row.
            self.startFlag = True
            self.test.append(field)
        elif self.startFlag and len(self.test) < self.ROW_LENGTH:
            self.test.append(field)
        elif len(self.test) == self.ROW_LENGTH:
            # Row is complete: emit every field followed by '*', then the
            # two-space tail and a separator line. The current text node
            # only acts as the trigger and is not stored.
            print('*'.join(self.test) + '*  ')
            print('===============')
            self.startFlag = False
            self.test.clear()
            

def main():
    """Download the report page and run it through ``MyHTMLParser``.

    Issues a GET request for the hard-coded MOPS URL, configures the root
    logger to write DEBUG-level messages to ``system.log``, decodes the
    response body as Big5 (undecodable bytes are ignored) and feeds the
    resulting text to a fresh parser, which prints the rows it finds.
    """
    pool = urllib3.PoolManager()
    response = pool.request(
        'GET',
        'http://mops.twse.com.tw/nas/t21/sii/t21sc03_106_6_0.html',
    )

    logging.basicConfig(
        level=logging.DEBUG,
        format='%(message)s',
        filename='system.log',
    )

    page_text = response.data.decode('big5', 'ignore')
    MyHTMLParser().feed(page_text)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()


執行結果(輸出範例):
1213*大飲*46321*49849*70686*-7.07*-34.46*295094*312894*-5.68*

沒有留言:

張貼留言