2024-09-16|閱讀時間 ‧ 約 25 分鐘

籌碼面 - 台灣證券交易所買賣日報表查詢系統

自從筆者開始研究量化交易之後,就聽說股價跟籌碼面脫不了關係,其中的重點就是要分析分點交易狀況。台灣證券交易所與證券櫃檯買賣中心都有提供買賣日報表查詢,但是為了避免太多機器人爬蟲造成主機負擔,前者需要輸入圖形驗證碼,後者則需要勾選reCAPTCHA「我不是機器人」的欄位。
這一篇就先以程式自動輸入圖形驗證碼,並取得證券交易所每一檔股票的買賣日報表當作範例。
台灣證券交易所的買賣日報表查詢網路連結為 https://bsr.twse.com.tw/bshtm/bsMenu.aspx 。主要的邏輯架構也不難,先把台灣證券交易所的網頁抓下來,找到辨識驗證碼的欄位,透過OpenCV與OCR把圖形轉成文字,填回驗證碼以及想要抓取的股票代碼,按下網頁確定的按鈕,網站就會吐出該股票代碼的買賣日報表,我們把這些資料做個欄位處理之後,存成一個文字檔案,再由其他程式做籌碼計算。
這邊要注意的是OCR套件不一定每次都可以辨識出正確的驗證碼,因為台灣證券交易所的驗證碼長度永遠都是5個字元,所以我們可以在辨識結束後檢查長度,如果不正確,就重新辨識一次。
另外就是台灣證券交易所如果偵測到大量的索取要求,將會直接把IP鎖住一陣子,所以在每次索取要求之間,我們都先暫停一段時間,這邊我們是設定一分鐘。主要程式碼就如下所示:
        private async Task<bool> GetTWSEBroker(string code, string path)
        {
            // https://bsr.twse.com.tw/bshtm/bsMenu.aspx
            // https://bsr.twse.com.tw/bshtm/bsContent.aspx

            int dlretry = 0;
            int ocrretry = 0;
        begin:
            string reqtwse = "https://bsr.twse.com.tw/bshtm/bsMenu.aspx";
            CookieContainer _cookies = new CookieContainer();
            string csvtwse = await GetAsync(reqtwse, _cookies, 65001);
        again:
            string Captcha = FindImg(csvtwse);
            //Debug.WriteLine(code + " 辨識驗證碼 " + Captcha);
            if (Captcha.Length != 5)
            {
                if (ocrretry++ < 5)
                {
                    goto again;
                }
                else
                {
                    Debug.WriteLine(code + " 辨識驗證碼錯誤 " + Captcha);
                }
            }

            NameValueCollection postParams = System.Web.HttpUtility.ParseQueryString(string.Empty);
            postParams.Add("__EVENTTARGET", "");
            postParams.Add("__EVENTARGUMNET", "");
            postParams.Add("__LASTFOCUS", "");
            postParams.Add("__VIEWSTATE", FindToken(csvtwse, "__VIEWSTATE"));
            postParams.Add("__VIEWSTATEGENERATOR", FindToken(csvtwse, "__VIEWSTATEGENERATOR"));
            postParams.Add("__EVENTVALIDATION", FindToken(csvtwse, "__EVENTVALIDATION"));
            postParams.Add("RadioButton_Normal", "RadioButton_Normal");
            postParams.Add("TextBox_Stkno", code);
            postParams.Add("CaptchaControl1", Captcha);
            postParams.Add("btnOK", "%E6%9F%A5%E8%A9%A2");

            string responseInString = SetAsync(reqtwse, _cookies, postParams);
            //Debug.WriteLine(responseInString);
            if (FindContent(responseInString))
            {
                string csvurl = "https://bsr.twse.com.tw/bshtm/bsContent.aspx";
                string csvline = await GetAsync(csvurl, _cookies, 950);

                if (csvline.Count() > 0)
                {
                    File.WriteAllText(path, "證券代碼 " + code + " 序號,券商,價格,買進股數,賣出股數\r\n");
                    int skipline = 0;
                    foreach (var line in csvline.Split(new string[] { "\n" }, StringSplitOptions.RemoveEmptyEntries).Select(s => s.Replace(" ", "")).Select(s => s.Replace(" ", "")))
                    {
                        if (skipline++ < 3) continue;

                        try
                        {
                            string[] csv = line.Split(",");
                            string strData = string.Empty;
                            if (csv.Count() == 11)
                            {
                                if (csv[6] != "")
                                {
                                    strData = string.Format("{0},{1},{2},{3},{4}\r\n{5},{6},{7},{8},{9}\r\n",
                                        csv[0], csv[1], csv[2], csv[3], csv[4], csv[6], csv[7], csv[8], csv[9], csv[10]
                                        );
                                }
                                else
                                {
                                    strData = string.Format("{0},{1},{2},{3},{4}\r\n",
                                        csv[0], csv[1], csv[2], csv[3], csv[4]
                                        );
                                }

                                File.AppendAllText(path, strData);
                            }
                            else
                            {
                                File.Delete(path);
                                Debug.WriteLine("TWSE Err inline " + code);
                            }
                        }
                        catch
                        {
                            File.Delete(path); 
                            Debug.WriteLine("TWSE Err File " + code);
                        }
                    }
                }

                return true;
            }
            else if (dlretry++ < 5)
            {
                //Debug.WriteLine("驗證碼錯誤");
                Thread.Sleep(60000);
                goto begin;
            }

            Debug.WriteLine("TWSE Err download " + code);
            return false;
        }

為了可以取得網頁上面的資料,在這邊我們採用HtmlAgilityPack套件的HtmlDocument類別,幫助我們解析抓下來的網頁內容,並提取指定的欄位。其中辨識圖片中的驗證碼,可以透過OpenCV先做圖片的前處理(二值化、侵蝕與去雜訊),再透過Tesseract OCR的幫助,把圖片中的字元轉成實際的文字,所以使用上要在C#先安裝HtmlAgilityPack、OpenCvSharp4以及Tesseract的套件。
我把相關的程式碼都放在下方,提供大家做參考。
        private string FindImg(string webdata)
        {
            try
            {
                if (webdata.Length == 0)
                {
                    Debug.WriteLine("CaptchaImage has no challenge. Sleeping for a while now.");
                    Thread.Sleep(1800000);
                    return string.Empty;
                }

                string imgname = string.Empty;

                HtmlDocument document = new HtmlDocument();
                document.LoadHtml(webdata);

                foreach (HtmlNode tag in document.DocumentNode.SelectNodes("//img"))
                {
                    if (tag.Attributes["src"].Value.Contains("CaptchaImage"))
                    {
                        imgname = tag.Attributes["src"].Value;
                    }
                }
                
                string fname = @s_temppath + "CaptchaImage.JFIF";
                if (File.Exists(fname))
                {
                    File.Delete(fname);
                }

                using (WebClient webClient = new WebClient())
                {
                    webClient.DownloadFile(new Uri("https://bsr.twse.com.tw/bshtm/" + imgname), fname); 
                    //webClient.DownloadData
                }

                return DecodeCaptcha(@s_temppath + "CaptchaImage.JFIF");
            }
            catch (NullReferenceException)
            {
                // web return no page
                Debug.WriteLine("CaptchaImage has no challenge. Sleeping for a while now.");
                Thread.Sleep(1800000);                
            }
            catch
            {
                //Debug.WriteLine("Exception in CaptchaImage"); 
                Thread.Sleep(5000);
            }
            return string.Empty;
        }
        private string FindToken(string webdata, string item)
        {
            try
            {
                HtmlDocument document = new HtmlDocument();
                document.LoadHtml(webdata);
                string description = string.Empty;
                foreach (HtmlNode tag in document.DocumentNode.SelectNodes("//input"))
                {
                    if (tag.Attributes["name"] != null && tag.Attributes["id"] != null && tag.Attributes["name"].Value == item)
                    {
                        description = tag.Attributes["value"].Value;
                    }
                }
                return description;
                //return System.Web.HttpUtility.UrlEncode(description); 
            }
            catch (NullReferenceException)
            {
                // web return no page
                Debug.WriteLine("CaptchaImage has no challenge. Sleeping for a while now.");
                Thread.Sleep(1800000);
            }
            catch
            {
                ;
            }
            return string.Empty;
        }
        private string DecodeCaptcha(string img)
        {
            var ocrtext = string.Empty;
            // OpenCV#
            using Mat captcha = new Mat(img, ImreadModes.Grayscale);
            if (captcha.Empty())
            {
                return ocrtext;
            }

            // Convert the captcha to black and white.
            using Mat captcha_bw = new Mat();
            Cv2.Threshold(captcha, captcha_bw, 128, 255, ThresholdTypes.Binary | ThresholdTypes.Otsu);

            // Erode the image to remove dot noise and that wierd line. I use a 3x3 rectengal as the kernal.
            using Mat captcha_erode = new Mat();
            using Mat element = Cv2.GetStructuringElement(MorphShapes.Rect, new Size(3, 3));
            Cv2.Erode(captcha_bw, captcha_erode, element);

            // Some cosmetic
            using Mat captcha_denoise = new Mat();
            Cv2.FastNlMeansDenoising(captcha_erode, captcha_denoise, 50);

            byte[] buffer = captcha_denoise.ToMemoryStream().ToArray();

            // OCR
            var path = Path.GetDirectoryName(Assembly.GetExecutingAssembly().CodeBase);
            path = Path.Combine(path, "tessdata");
            path = path.Replace("file:\\", "");

            using (var engine = new TesseractEngine(path, "eng", EngineMode.Default))
            {
                engine.SetVariable("tessedit_char_whitelist", "1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ");
                engine.SetVariable("tessedit_unrej_any_wd", true);
                using (var ocr = Pix.LoadFromMemory(buffer))
                {
                    using (var page = engine.Process(ocr))
                    {
                        ocrtext = page.GetText();
                    }
                }
            }

            return ocrtext.Trim('\n');
        }
一開始,我對於網頁互動爬取資料也不是很熟悉,所以土炮煉製了這一大段的程式去抓資料,中間走過不少冤枉路,雖然我實在找不到籌碼面怎麼樣可以幫助程式交易,但是還是把這些結果記錄下來,留個紀念。
分享至
成為作者繼續創作的動力吧!
© 2024 vocus All rights reserved.