量化交易其中一個因子就是籌碼面,至於如何取得籌碼面的資料,則需要索取台灣證券交易所與證券櫃檯買賣中心的買賣日報表,基本上可以付費取得。如果想要免費取得的話,就需要自己寫一個爬網頁的機器人每天在收盤之後去索取買賣日報表。
這一篇文章就是筆者自己透過C#去證券櫃檯買賣中心爬取買賣日報表的程式介紹。證券櫃檯買賣中心為了避免大量機器人索取資料,造成系統負荷,採用了reCaptcha的方式阻擋機器人,如果是真人操作,需要選取提示所需圖片,並勾選reCaptcha「我不是機器人」的欄位。這種方式對於機器人來說,相當不方便繞過,還好reCaptcha驗證有提供盲人語音提示的方式,只要取得播放的語音,並且將這段語音轉為文字,就可以通過reCaptcha驗證。
在此我們採用Selenium套件來處理網頁資料互動,透過System.Speech將語音轉為文字,一旦通過驗證,就可以抓取指定股票的買賣日報表。依據個人經驗,一次驗證成功可以抓取五筆到十筆股票資料。如果被證券櫃檯買賣中心的伺服器阻擋,就先休息個五分鐘到二十分鐘,再嘗試下一次的驗證與資料的擷取。
必須要通過reCaptcha驗證之後,索取買賣日報表才能正常互動。以下就是主程式部分。
/// <summary>
/// Collects the broker buy/sell daily report for every TPEX entry of
/// <paramref name="codelist"/>, solving the site's reCaptcha whenever a
/// download is rejected. Keeps opening a fresh browser session and starting
/// over until one pass reaches the end of the list.
/// </summary>
/// <param name="codelist">Stocks to process; entries whose Kind is not StockEnum.TPEX are skipped.</param>
/// <param name="date">Date text used as the sub-folder name and as part of each CSV file name.</param>
/// <returns>true once a full pass over the list has finished.</returns>
private bool TPEXThread(List<StockList> codelist, string date)
{
    int NUMBER_OF_ITERATIONS = 0;
    bool completed = false;
    string reqtpex = "https://www.tpex.org.tw/web/stock/aftertrading/broker_trading/brokerBS.php?l=zh-tw";
    string dlurl = "https://www.tpex.org.tw/web/stock/aftertrading/broker_trading/download_ALLCSV.php";
    Debug.WriteLine(DateTime.Now.ToString("MM/dd HH:mm:ss") + " TPEX Broker " + date);
    while (!completed)
    {
        using (IWebDriver driver = OpenDriver())
        {
            // i is the index of the next entry to handle; it only advances once
            // an entry is skipped, already on disk, or downloaded successfully.
            int i = 0;
            bool aborted = false;
            while (i < codelist.Count && !aborted)
            {
                if (codelist[i].Kind != StockEnum.TPEX)
                {
                    i++;
                    continue;
                }

                string code = codelist[i].ID;
                string path = Path.Combine(@s_homepath, date, code + "_" + date + ".csv");
                if (File.Exists(path) || ReportCollect(dlurl, code, path))
                {
                    i++;
                    continue;
                }

                // Download was rejected: solve the captcha, submit the stock code,
                // then retry the same entry (i is intentionally not advanced).
                // NOTE(review): if ReportCollect keeps failing after a successful
                // solve, this retries the same entry indefinitely — confirm intended.
                try
                {
                    if (solve(driver, reqtpex))
                    {
                        driver.SwitchTo().DefaultContent();
                        var stk = driver.FindElement(By.Name("stk_code"));
                        stk.Click();
                        stk.SendKeys(code);
                        driver.FindElement(By.XPath("//button[@type='button']")).Click();
                    }
                    else
                    {
                        Debug.WriteLine(DateTime.Now.ToString("MM/dd HH:mm:ss") + " Error solve " + code);
                        System.Threading.Thread.Sleep(300000); // 5 mins
                        aborted = true; // give up this session; outer loop opens a new one
                    }
                }
                catch (NoSuchElementException)
                {
                    Debug.WriteLine(DateTime.Now.ToString("MM/dd HH:mm:ss") + " Unable solve " + code);
                    System.Threading.Thread.Sleep(1200000); // 20 mins
                    aborted = true;
                }
            }

            // Guard the percentage against an empty list (would divide by zero).
            decimal progress = codelist.Count == 0 ? 100m : decimal.Divide(i, codelist.Count) * 100;
            Debug.WriteLine(DateTime.Now.ToString("MM/dd HH:mm:ss") + " TPEX progress " + progress.ToString("F") + "%");
            completed = i == codelist.Count;
            NUMBER_OF_ITERATIONS++;
        }
    }
    Debug.WriteLine(DateTime.Now.ToString("MM/dd HH:mm:ss") + " Complete TPEX Broker " + NUMBER_OF_ITERATIONS);
    return completed;
}
/// <summary>
/// Fetches the report text for one stock via Download_Report and converts it
/// into a five-column CSV file at <paramref name="path"/>.
/// </summary>
/// <param name="dlurl">Download URL handed to Download_Report.</param>
/// <param name="code">Stock code; passed to Download_Report and written into the header line.</param>
/// <param name="path">Destination CSV file.</param>
/// <returns>
/// true when the file was written; false when the response was null/empty or
/// a data line did not have the expected number of quoted fields (in which
/// case no file is left at <paramref name="path"/>).
/// </returns>
/// <remarks>
/// The output is buffered and written with a single call, so I/O errors while
/// writing surface as exceptions instead of being swallowed.
/// </remarks>
private bool ReportCollect(string dlurl, string code, string path)
{
    string responseInString = Download_Report(dlurl, code);
    if (string.IsNullOrEmpty(responseInString))
    {
        //Debug.WriteLine("Err download " + code);
        System.Threading.Thread.Sleep(1000);
        return false;
    }

    StringBuilder output = new StringBuilder();
    output.Append("證券代碼 " + code + " 序號,券商,價格,買進股數,賣出股數\r\n");

    // The first three non-empty lines are skipped; for the rest, commas and
    // spaces are stripped so that splitting on the quote character leaves the
    // field values at the odd indexes.
    // NOTE(review): the two space replacements look identical here; one may
    // originally have targeted a different whitespace character — confirm.
    var dataLines = responseInString
        .Split(new string[] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries)
        .Skip(3)
        .Select(s => s.Replace(",", "").Replace(" ", "").Replace(" ", ""));

    foreach (var line in dataLines)
    {
        string[] csv = line.Split("\"");
        if (csv.Length == 21)
        {
            // Ten quoted fields: two records share one source line.
            output.Append(string.Format("{0},{1},{2},{3},{4}\r\n{5},{6},{7},{8},{9}\r\n",
                csv[1], csv[3], csv[5], csv[7], csv[9], csv[11], csv[13], csv[15], csv[17], csv[19]));
        }
        else if (csv.Length == 11)
        {
            // Five quoted fields: a single record.
            output.Append(string.Format("{0},{1},{2},{3},{4}\r\n",
                csv[1], csv[3], csv[5], csv[7], csv[9]));
        }
        else
        {
            //Debug.WriteLine("Err in " + code);
            // Unexpected layout: make sure no file remains so the caller retries.
            File.Delete(path);
            return false;
        }
    }

    File.WriteAllText(path, output.ToString());
    //Debug.WriteLine("OK download " + code);
    System.Threading.Thread.Sleep(new Random().Next(MIN_RAND / 2, MAX_RAND / 2));
    return true;
}
其中Selenium套件的部分,我們透過Chrome瀏覽器與網頁互動,所以開啟網頁之前需要先設定Chrome瀏覽器的參數,並依據您電腦上所使用的Chrome瀏覽器版本,定期更新對應的瀏覽器驅動程式(ChromeDriver)。如果驅動程式版本和瀏覽器版本不一致,就會發生錯誤。這個算是筆者的經驗教訓之一。
/// <summary>
/// Starts a headless Chrome session using the chromedriver located in the
/// current working directory and applies the timeouts used by this scraper.
/// </summary>
/// <returns>A configured driver; the caller is responsible for disposing it.</returns>
private IWebDriver OpenDriver()
{
    ChromeOptions chromeoptions = new ChromeOptions();
    chromeoptions.AddArgument("headless");
    var chromedriverService = ChromeDriverService.CreateDefaultService(Environment.CurrentDirectory);
    chromedriverService.HideCommandPromptWindow = true;
    IWebDriver _driver = new ChromeDriver(chromedriverService, chromeoptions);
    try
    {
        var timeouts = _driver.Manage().Timeouts();
        timeouts.ImplicitWait = TimeSpan.FromSeconds(5);
        timeouts.AsynchronousJavaScript = TimeSpan.FromSeconds(5);
        timeouts.PageLoad = TimeSpan.FromSeconds(15);
        return _driver;
    }
    catch
    {
        // Configuration failed after the browser was launched: shut it down
        // so the session is not left running, then let the error surface.
        _driver.Dispose();
        throw;
    }
}
/// <summary>
/// Reports whether the current browsing context contains an element that
/// matches <paramref name="xpath"/>.
/// </summary>
/// <param name="driver">Driver to query.</param>
/// <param name="xpath">XPath expression to look up.</param>
/// <returns>false when the lookup raises NoSuchElementException, true otherwise.</returns>
private bool is_exists_by_xpath(IWebDriver driver, string xpath)
{
    bool found;
    try
    {
        driver.FindElement(By.XPath(xpath));
        found = true;
    }
    catch (NoSuchElementException)
    {
        found = false;
    }
    return found;
}
/// <summary>
/// Probes whether the element with the given id exists and accepts keyboard
/// input, by sending it a Delete key press.
/// </summary>
/// <param name="driver">Driver to query.</param>
/// <param name="element_id">Value of the element's id attribute.</param>
/// <returns>
/// false when the element is missing or rejects the key press as not
/// interactable; true otherwise.
/// </returns>
private bool is_interactable_by_id(IWebDriver driver, string element_id)
{
    try
    {
        // A single lookup is enough: FindElement itself reports a missing element.
        driver.FindElement(By.Id(element_id)).SendKeys(Keys.Delete);
        return true;
    }
    catch (NoSuchElementException)
    {
        return false;
    }
    catch (ElementNotInteractableException)
    {
        return false;
    }
}
一旦成功開啟網站內容之後,就需要嘗試去reCaptcha裡面切換成語音驗證方式,以下程式就是這些切換過程,有點複雜。基本上就是尋找特定字串,並且去做對應的互動。
/// <summary>
/// Answers the audio challenge that is currently displayed: downloads the
/// audio, converts it to text, types the text into the response box and
/// presses the verify button.
/// </summary>
/// <param name="driver">Driver already switched into the challenge frame by the caller.</param>
/// <param name="RECAPTCHA_PAGE_URL">Page that is reloaded when no audio challenge is offered.</param>
/// <returns>
/// true when an answer was submitted; false when any step could not be
/// carried out or an exception occurred. A true result does not by itself
/// mean the answer was accepted.
/// </returns>
private bool solve_audio_challenge(IWebDriver driver, string RECAPTCHA_PAGE_URL)
{
    const string downloadLinkXPath = "//a[@class='rc-audiochallenge-tdownload-link']";
    const string textChallengeXPath = "//div[@class='rc-textchallenge-control']";
    Action shortPause = () => Thread.Sleep(new Random().Next(MIN_RAND, MAX_RAND));

    try
    {
        // Neither the download link nor the text-challenge control is present:
        // reload the page, back off for a long random interval and report failure.
        bool hasDownloadLink = is_exists_by_xpath(driver, downloadLinkXPath);
        if (!hasDownloadLink)
        {
            bool hasTextChallenge = is_exists_by_xpath(driver, textChallengeXPath);
            if (!hasTextChallenge)
            {
                Debug.WriteLine("No element in audio challenge download link!!");
                driver.Navigate().GoToUrl(RECAPTCHA_PAGE_URL);
                Debug.WriteLine("Sleeping for a while now (6 to 9 minutes)");
                Thread.Sleep(new Random().Next(REAL_LONG_MIN_RAND, REAL_LONG_MAX_RAND));
                return false;
            }
        }

        string audioUrl = driver.FindElement(By.XPath(downloadLinkXPath)).GetAttribute("href");
        string wavPath = get_challenge_audio(audioUrl);
        if (wavPath == string.Empty)
        {
            return false;
        }

        shortPause();

        string transcript = speech_to_text(wavPath);
        if (transcript.Length == 0)
        {
            return false;
        }

        if (!is_interactable_by_id(driver, "audio-response"))
        {
            return false;
        }

        driver.FindElement(By.Id("audio-response")).SendKeys(transcript);
        shortPause();
        driver.FindElement(By.Id("recaptcha-verify-button")).Click();
        shortPause();
        return true;
    }
    catch
    {
        Debug.WriteLine("Exception solve audio challenge");
        return false;
    }
}
/// <summary>
/// Opens the reCaptcha page and tries to get the checkbox into the checked
/// state, using the audio challenge when one is required.
/// </summary>
/// <param name="driver">Browser session to drive.</param>
/// <param name="RECAPTCHA_PAGE_URL">URL of the page that hosts the reCaptcha.</param>
/// <returns>
/// true when get_recaptcha_challenge reports that no challenge has to be
/// solved, or when the checkbox is found checked after answering the audio
/// challenge; false otherwise, including when any exception is raised.
/// </returns>
private bool solve(IWebDriver driver, string RECAPTCHA_PAGE_URL)
{
    try
    {
        if (get_recaptcha_challenge(driver, RECAPTCHA_PAGE_URL))
        {
            return true;
        }

        driver.SwitchTo().DefaultContent();
        var iframes = driver.FindElements(By.TagName("iframe"));
        if (!get_audio_challenge(driver, iframes))
        {
            return false;
        }
        if (!solve_audio_challenge(driver, RECAPTCHA_PAGE_URL))
        {
            return false;
        }

        // Up to solve_more_count extra answers may be submitted. The loop runs
        // one additional time so the outcome of the last answer is verified too.
        int solve_more_count = 3;
        for (int i = 0; i <= solve_more_count; i++)
        {
            // The first iframe holds the checkbox.
            driver.SwitchTo().DefaultContent();
            driver.SwitchTo().Frame(iframes[0]);
            if (is_exists_by_xpath(driver, "//span[@aria-checked='true']"))
            {
                return true;
            }
            if (i == solve_more_count)
            {
                break; // retry budget used up and still not checked
            }

            // The third iframe holds the challenge dialog.
            driver.SwitchTo().DefaultContent();
            iframes = driver.FindElements(By.TagName("iframe"));
            driver.SwitchTo().Frame(iframes[2]);
            if (is_exists_by_xpath(driver, "//div[@class='rc-audiochallenge-error-message']") &&
                is_exists_by_xpath(driver, "//div[contains(text(), 'Multiple correct solutions required')]"))
            {
                Debug.WriteLine("Need to solve more. Let's do this!");
                // Result is not checked here; the next pass re-inspects the page state.
                solve_audio_challenge(driver, RECAPTCHA_PAGE_URL);
            }
            else
            {
                Debug.WriteLine("Unable to find audio challenge");
                return false;
            }
        }
    }
    catch (Exception ex)
    {
        Debug.WriteLine("Exception solve");
        Debug.WriteLine(ex.Message);
    }
    return false;
}
/// <summary>
/// Loads the reCaptcha page and clicks the checkbox, retrying up to three times.
/// </summary>
/// <param name="driver">Browser session to drive.</param>
/// <param name="RECAPTCHA_PAGE_URL">URL of the page that hosts the reCaptcha.</param>
/// <returns>
/// true when the page has no iframe at all, or when the checkbox is checked
/// right after the click; false when the checkbox is not checked after the
/// click, or when all three attempts ended without reaching the click result.
/// </returns>
private bool get_recaptcha_challenge(IWebDriver driver, string RECAPTCHA_PAGE_URL)
{
    const string checkboxXPath = "//div[@class='recaptcha-checkbox-border' and @role='presentation']";
    Action shortPause = () => Thread.Sleep(new Random().Next(MIN_RAND, MAX_RAND));
    Action longPause = () => Thread.Sleep(new Random().Next(REAL_LONG_MIN_RAND, REAL_LONG_MAX_RAND));

    int attempt = 0;
    while (attempt < 3)
    {
        attempt++;
        try
        {
            driver.Navigate().GoToUrl(RECAPTCHA_PAGE_URL);
            shortPause();

            var frames = driver.FindElements(By.TagName("iframe"));
            if (frames.Count == 0)
            {
                Debug.WriteLine("ReCaptcha is empty. Trying again!");
                longPause();
                return true;
            }

            // The checkbox lives in the first iframe.
            driver.SwitchTo().Frame(frames[0]);
            shortPause();

            bool checkboxPresent = is_exists_by_xpath(driver, checkboxXPath);
            if (!checkboxPresent)
            {
                Debug.WriteLine("No element in the frame!!");
                continue;
            }

            driver.FindElement(By.XPath(checkboxXPath)).Click();
            shortPause();

            bool alreadyChecked = is_exists_by_xpath(driver, "//span[@aria-checked='true']");
            if (alreadyChecked)
            {
                Debug.WriteLine("ReCaptcha has no challenge. Trying again!");
            }
            return alreadyChecked;
        }
        catch (NoSuchElementException)
        {
            Debug.WriteLine("Exception no such element. Trying again!");
            shortPause();
        }
        catch (WebDriverTimeoutException)
        {
            Debug.WriteLine("Exception no URL response. Trying again!");
            longPause();
        }
    }
    return false;
}
/// <summary>
/// Switches into the third iframe of the page and presses the button that
/// changes the challenge to its audio variant.
/// </summary>
/// <param name="driver">Browser session; expected to be at the top-level document.</param>
/// <param name="iframes">The iframes found on the page; the one at index 2 is used.</param>
/// <returns>
/// true when the audio button was clicked; false when there are fewer than
/// three iframes or the audio button is not present.
/// </returns>
private bool get_audio_challenge(IWebDriver driver, ReadOnlyCollection<IWebElement> iframes)
{
    // Check the index explicitly instead of relying on an out-of-range
    // exception being swallowed further up the call chain.
    if (iframes == null || iframes.Count < 3)
    {
        Debug.WriteLine("Not enough iframes for audio challenge!!");
        return false;
    }

    try
    {
        driver.SwitchTo().Frame(iframes[2]);
        if (!is_exists_by_xpath(driver, "//button[@id='recaptcha-audio-button']"))
        {
            Debug.WriteLine("No element of audio challenge!!");
            Thread.Sleep(new Random().Next(MIN_RAND, MAX_RAND));
            return false;
        }
        //Debug.WriteLine("Clicking on audio challenge");
        driver.FindElement(By.XPath("//button[@id='recaptcha-audio-button']")).Click();
        Thread.Sleep(new Random().Next(MIN_RAND, MAX_RAND));
        return true;
    }
    catch (NoSuchElementException)
    {
        Debug.WriteLine("No such element of audio challenge!!");
        Thread.Sleep(new Random().Next(LONG_MIN_RAND, REAL_LONG_MAX_RAND));
        return false;
    }
}
接下來就是把語音檔從MP3格式轉成WAV格式,並且透過System.Speech套件轉成文字,再填回網頁。如果順利的話,其實不用全對,reCaptcha也會讓你通過認證,我想大概就算真人,也沒辦法打對所有的文字。
/// <summary>
/// Downloads the challenge audio as "audio.mp3" under s_temppath and converts
/// it to a PCM wave file "audio.wav" in the same location.
/// </summary>
/// <param name="url">Address of the MP3 file to download.</param>
/// <returns>Path of the wave file, or an empty string when any step throws.</returns>
private string get_challenge_audio(string url)
{
    string result;
    try
    {
        // TODO: instead of file by stream
        string mp3Path = @s_temppath + "audio.mp3";
        string wavPath = @s_temppath + "audio.wav";

        using (var client = new WebClient())
        {
            client.DownloadFile(new Uri(url), mp3Path);
        }

        using (var reader = new Mp3FileReader(mp3Path))
        using (WaveStream pcmStream = WaveFormatConversionStream.CreatePcmStream(reader))
        {
            WaveFileWriter.CreateWaveFile(wavPath, pcmStream);
        }

        result = wavPath;
    }
    catch
    {
        Debug.WriteLine("exception download challenge audio file");
        result = string.Empty;
    }
    return result;
}
/// <summary>
/// Transcribes a wave file with the System.Speech dictation grammar.
/// </summary>
/// <param name="audio_source">Path of the wave file to recognize.</param>
/// <returns>
/// The recognized phrases joined by single spaces; an empty string when the
/// file does not exist or nothing was recognized. If recognition throws part
/// way through, whatever was recognized up to that point is returned.
/// </returns>
private string speech_to_text(string audio_source)
{
    if (!File.Exists(audio_source))
    {
        return string.Empty;
    }

    using (SpeechRecognitionEngine sre = new SpeechRecognitionEngine())
    {
        sre.LoadGrammar(new DictationGrammar());
        sre.SetInputToWaveFile(audio_source);
        // TimeSpan(long) takes 100 ns ticks: Int32.MaxValue ticks is roughly
        // 214 seconds, 100000000 ticks is 10 seconds.
        sre.BabbleTimeout = new TimeSpan(Int32.MaxValue);
        sre.InitialSilenceTimeout = new TimeSpan(Int32.MaxValue);
        sre.EndSilenceTimeout = new TimeSpan(100000000);
        sre.EndSilenceTimeoutAmbiguous = new TimeSpan(100000000);

        StringBuilder sb = new StringBuilder();
        try
        {
            // Recognize() is called repeatedly; a null result ends the loop.
            while (true)
            {
                var recText = sre.Recognize();
                if (recText == null)
                {
                    break;
                }
                // Separate successive results so their words do not run together.
                if (sb.Length > 0)
                {
                    sb.Append(' ');
                }
                sb.Append(recText.Text);
            }
        }
        catch
        {
            // Best effort: stop recognizing and keep the text gathered so far.
        }
        //Debug.WriteLine("speech: " + sb.ToString());
        return sb.ToString();
    }
}
基本上reCaptcha驗證比較複雜,而且聽說Google還打算改得更複雜,如果語音驗證方式沒有被移除的話,這應該還是機器人通過驗證比較可行的方式。
好啦,這篇介紹就先到這邊結束,希望大家都能夠賺大錢!