In the last post we talked about how Google 'hides' its price data behind its own custom IDs and how we can retrieve and match them to real EPICs for our own use. In this post we will cover the actual exercise of scraping the price data and the pitfalls to watch out for. A word of warning before we progress: the code listed below is not as well documented as I would like, so please ask if you have questions.
I download pricing data every day for all companies in the FTSE100 and FTSE250, fetching six years' worth of data for each EPIC (around 750,000 records in total). You may ask why I delete and re-download everything every day. The reason is that I believe there are errors in the daily data that are sometimes corrected by Google, and I wouldn't catch these 'corrections' if I didn't download every day. I may be very wrong in this opinion, but it makes me feel happier.
The actual 'work' of fetching the Google price data is covered by the following code:
protected static DataTable DownLoadPricesFromGoogle(DateTime latestDate,
    DateTime earliestDate, string googleCid)
{
    Begin:
    // Template for Google's historical-prices CSV download. Dates are in
    // month/day/year order and the '/' separators must be URL-encoded as %2F.
    string urlTemplate =
        "http://finance.google.co.uk/finance/historical?cid=[symbol]" +
        "&startdate=[earliestMonth]%2F[earliestDay]%2F[earliestYear]" +
        "&enddate=[latestMonth]%2F[latestDay]%2F[latestYear]&output=csv";
    string latestMonth = latestDate.Month.ToString("00");
    string latestDay = latestDate.Day.ToString("00");
    string latestYear = latestDate.Year.ToString("0000").Substring(2, 2);
    string earliestMonth = earliestDate.Month.ToString("00");
    string earliestDay = earliestDate.Day.ToString("00");
    string earliestYear = earliestDate.Year.ToString("0000").Substring(2, 2);
    // Substitute the real values into the template.
    urlTemplate = urlTemplate.Replace("[symbol]", googleCid);
    urlTemplate = urlTemplate.Replace("[latestMonth]", latestMonth);
    urlTemplate = urlTemplate.Replace("[latestDay]", latestDay);
    urlTemplate = urlTemplate.Replace("[latestYear]", latestYear);
    urlTemplate = urlTemplate.Replace("[earliestMonth]", earliestMonth);
    urlTemplate = urlTemplate.Replace("[earliestDay]", earliestDay);
    urlTemplate = urlTemplate.Replace("[earliestYear]", earliestYear);
    string history;
    var wc = new WebClient();
    try
    {
        // Download the CSV as a single string; pause briefly so we
        // don't hammer Google's servers.
        history = wc.DownloadString(urlTemplate);
        Thread.Sleep(1000);
    }
    catch (WebException ex)
    {
        // A protocol error (e.g. a 404) means there is no data for this CID.
        if (ex.Status == WebExceptionStatus.ProtocolError)
        {
            return null;
        }
        // Anything else (timeout, dropped connection) - wait and retry.
        Thread.Sleep(3000);
        goto Begin;
    }
    finally
    {
        wc.Dispose();
    }
    return Parse(new StringReader(history), true);
}
The 'Begin:' label at the top of the code is a horrible kludge, put in so the download can be retried after any delay or transient failure in fetching the page - I have not yet found a better way (one alternative, a bounded retry loop, is sketched below). The rest of the code is fairly self-explanatory.
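If you would rather avoid the goto, a minimal sketch of the same retry behaviour written as a plain loop might look like the following. Note that DownloadWithRetry and the three-attempt limit are my own inventions, not part of the code above:
using System;
using System.Net;
using System.Threading;

// Sketch of a bounded retry loop that could replace the 'Begin:' goto.
// The three-attempt limit is an arbitrary choice.
private static string DownloadWithRetry(string url, int maxAttempts = 3)
{
    for (int attempt = 1; attempt <= maxAttempts; attempt++)
    {
        using (var wc = new WebClient())
        {
            try
            {
                string csv = wc.DownloadString(url);
                Thread.Sleep(1000); // be polite between requests
                return csv;
            }
            catch (WebException ex)
            {
                // A protocol error means there is nothing to fetch - give up.
                if (ex.Status == WebExceptionStatus.ProtocolError)
                    return null;
                // Otherwise wait and try again, up to the attempt limit.
                Thread.Sleep(3000);
            }
        }
    }
    return null;
}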
The 'Parse' method converts the downloaded CSV into a DataTable. To be honest I should get around to rewriting this, as there is really no need for a DataTable here and a much simpler approach, such as a simple data transport object, would suffice (a rough sketch of that idea follows the listing):
public static DataTable Parse(TextReader stream, bool headers)
{
    var table = new DataTable();
    table.TableNewRow += Table_NewRow;
    var csv = new CsvStream(stream);
    string[] row = csv.GetNextRow();
    if (row == null)
        return null;
    if (headers)
    {
        // Use the first CSV row to build the column list.
        foreach (string header in row)
        {
            if (!string.IsNullOrEmpty(header) && !table.Columns.Contains(header))
            {
                // The raw header can carry extra characters around 'Date',
                // so normalise it to a plain "Date" column.
                if (header.Contains("Date"))
                {
                    table.Columns.Add("Date", typeof(string));
                }
                else
                {
                    table.Columns.Add(header, typeof(string));
                }
            }
            else
            {
                table.Columns.Add(GetNextColumnHeader(table), typeof(string));
            }
        }
        row = csv.GetNextRow();
    }
    while (row != null)
    {
        // Pad the table with generated column names if a row is wider than expected.
        while (row.Length > table.Columns.Count)
            table.Columns.Add(GetNextColumnHeader(table), typeof(string));
        if (String.IsNullOrEmpty(row[1]))
        {
            // Debugging hook: flags rows where the second column (Open) is empty.
            Debug.WriteLine("Row with an empty second column encountered");
        }
        table.Rows.Add(row);
        row = csv.GetNextRow();
    }
    return table;
}

private static void Table_NewRow(object sender, DataTableNewRowEventArgs e)
{
    Console.WriteLine("Table_NewRow Event: RowState={0}", e.Row.RowState);
}

private static string GetNextColumnHeader(DataTable table)
{
    // Generate "Column1", "Column2", ... skipping any name already in use.
    int c = 1;
    while (true)
    {
        string h = "Column" + c++;
        if (!table.Columns.Contains(h))
            return h;
    }
}
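For what it's worth, the simpler transport-object approach mentioned above might look something like the sketch below. PricePoint and ParsePrices are names I have made up for illustration, and the sketch assumes the usual six-column layout (Date, Open, High, Low, Close, Volume) that Google returns:
using System;
using System.Collections.Generic;
using System.IO;

// Hypothetical data transport object - one row of price data.
public class PricePoint
{
    public DateTime Date { get; set; }
    public decimal Open { get; set; }
    public decimal High { get; set; }
    public decimal Low { get; set; }
    public decimal Close { get; set; }
    public decimal Volume { get; set; }
}

// Sketch of a Parse replacement that returns a list of PricePoints
// instead of a DataTable.
public static List<PricePoint> ParsePrices(TextReader stream)
{
    var prices = new List<PricePoint>();
    string line = stream.ReadLine(); // skip the header row
    while ((line = stream.ReadLine()) != null)
    {
        string[] fields = line.Split(',');
        if (fields.Length < 6 || string.IsNullOrEmpty(fields[1]))
            continue; // skip incomplete rows
        prices.Add(new PricePoint
        {
            Date = DateTime.ParseExact(fields[0], "d-MMM-yy", null),
            Open = Convert.ToDecimal(fields[1]),
            High = Convert.ToDecimal(fields[2]),
            Low = Convert.ToDecimal(fields[3]),
            Close = Convert.ToDecimal(fields[4]),
            Volume = Convert.ToDecimal(fields[5])
        });
    }
    return prices;
}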
The 'RefreshPriceData' method controls the program flow of the 'scrape'; the key points to note are called out in the comments below:
public void RefreshPriceData()
{
    _db = new DataClassesDataContext();
    _db.Connection.ConnectionString = Database.ConnectionString;
    // Delete all existing prices first - everything is re-downloaded from scratch.
    Persistence.ExecuteDelete();
    string path = Environment.CurrentDirectory;
    path = string.Format("{0}\\", path);
    // Make sure the EPIC-to-CID mapping built in the last post is available.
    if (!File.Exists(path + "GoogleEpicCodes.xml"))
    {
        FetchGoogleCIDs();
    }
    DataTable gId = GetGoogleCids(path + "GoogleEpicCodes.xml");
    // Only FTSE100 and FTSE250 constituents are of interest.
    IQueryable<string> epics = (from e in _db.StatsLists
                                where e.FTSE_Index == "FTSE100" || e.FTSE_Index == "FTSE250"
                                orderby e.EPIC
                                select e.EPIC);
    DataTable _dt = Persistence.CreateDataTable();
    // EPIC + Date forms the primary key, used below to reject duplicate rows.
    var keys = new DataColumn[2];
    keys[0] = _dt.Columns["EPIC"];
    keys[1] = _dt.Columns["Date"];
    _dt.PrimaryKey = keys;
    int count = 0;
    foreach (DataRow row in gId.Rows)
    {
        if (epics.Contains(row[0].ToString()))
        {
            Console.WriteLine("Starting on " + row[0]);
            // Fetch six years of history for this CID.
            DataTable tempdt = DownLoadPricesFromGoogle(DateTime.Today,
                DateTime.Today.AddYears(-6), row[1].ToString());
            if (tempdt == null)
            {
                continue;
            }
            var keysFind = new object[2];
            foreach (DataRow dataRow in tempdt.Rows)
            {
                DataRow newRow = _dt.NewRow();
                newRow["EPIC"] = row[0];
                newRow["Date"] =
                    DateTime.ParseExact(dataRow["Date"].ToString(), "d-MMM-yy", null);
                newRow["Open"] = Convert.ToDecimal(dataRow["Open"].ToString());
                newRow["Volume"] = Convert.ToDecimal(dataRow["Volume"].ToString());
                decimal high = Convert.ToDecimal(dataRow["High"].ToString());
                decimal low = Convert.ToDecimal(dataRow["Low"].ToString());
                decimal close = Convert.ToDecimal(dataRow["Close"].ToString());
                newRow["High"] = high;
                newRow["Low"] = low;
                newRow["Close"] = close;
                // Skip any row whose EPIC/Date combination is already present.
                keysFind[0] = row[0];
                keysFind[1] =
                    DateTime.ParseExact(dataRow["Date"].ToString(), "d-MMM-yy", null);
                if (!_dt.Rows.Contains(keysFind))
                {
                    _dt.Rows.Add(newRow);
                }
            }
            // Flush to the database every 30 companies to keep memory usage down.
            if (count % 30 == 0 && count != 0)
            {
                Persistence.CopyData(_dt);
                _dt.Rows.Clear();
            }
            count++;
        }
    }
    // Flush whatever is left over after the last batch.
    Persistence.CopyData(_dt);
    _dt.Rows.Clear();
}
Implementing this code will allow you to easily 'scrape' the data you need. I run this every night prior to testing, and each run nets some three-quarters of a million records in about 15 minutes.
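If you want to run the scrape unattended each night, one option is a tiny console wrapper kicked off by Windows Task Scheduler. A minimal sketch follows - note that 'Scraper' is just a placeholder name for whichever class hosts RefreshPriceData in your own project:
using System;

public static class Program
{
    public static void Main()
    {
        // 'Scraper' is a placeholder for the class that hosts RefreshPriceData.
        var scraper = new Scraper();
        scraper.RefreshPriceData();
        Console.WriteLine("Price refresh complete at {0}", DateTime.Now);
    }
}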
Next time we will discuss how to easily insert so many records into an SQL database.