Web Scraping in ASP.NET
I know that there are other samples of web scraping out there, but here's mine. One of my customers asked me how to scrape our ASP.NET Web application, so I thought that I might post the example code. I like the viewstate regex - it's my first time using lookarounds in a regular expression.
using System;
using System.Text;
using System.Text.RegularExpressions;
using System.Net;
using System.IO;
namespace Dennany.WebScrape {
class MainClass {
    // Locates the value of the hidden __VIEWSTATE input using lookarounds.
    //  - The lookbehind anchors on  __VIEWSTATE" value="  (any whitespace
    //    around the attribute is tolerated).
    //  - The value itself is matched with [^"]* so the capture stops at the
    //    closing quote; a greedy ".*" would run on to the last  " />  on the
    //    same line and swallow any tags that follow the view state input.
    //  - CultureInvariant keeps the case-insensitive match reliable under
    //    cultures with special casing rules for 'i' / 'I'.
    private static readonly Regex ViewStateRegex = new Regex(
        "(?<=__viewstate\"\\s+value\\s*=\\s*\")[^\"]*(?=\")",
        RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

    /// <summary>
    /// Logs in to a forms-based ASP.NET application and prints a page that
    /// is only reachable after login. Three requests are made, all sharing
    /// one cookie container so the session cookie travels with each of them:
    /// a GET of the login page, a POST of the credentials together with the
    /// page's view state, and a GET of the target page.
    /// Any exception is written to the console rather than propagated.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    [STAThread]
    static void Main(string[] args) {
        try {
            // Modify as appropriate (placeholder values for the sample):
            const string baseUri = "http://remotewebhost/webpagedirectory/";
            const string loginDlgUri = baseUri + "LoginDlg.aspx";
            const string mainConsoleUri = baseUri + "Mainpage.aspx";
            const string username = "myuser";
            const string password = "p@ssw0rd";

            // This cookie container will persist the ASP.NET session ID cookie.
            CookieContainer cookies = new CookieContainer();

            // First request: fetch the login dialog. The container must be
            // attached BEFORE the request is sent, otherwise the session
            // cookie issued with this response is never captured.
            HttpWebRequest request =
                (HttpWebRequest)WebRequest.Create(loginDlgUri);
            request.CookieContainer = cookies;
            string loginDlgPage = ReadResponseBody(request);

            // Extract the (already URL-encoded) view state; it has to be
            // posted back along with the username and password.
            string viewState = GetViewState(loginDlgPage);

            // Build the postback string.
            // This string will vary depending on the page. The best way to
            // find out what your postback should look like is to monitor a
            // normal login using a utility like TCPTrace.
            // Field values are URL-encoded so that characters such as
            // '&', '=', '+' or '%' in the credentials cannot corrupt the body.
            string postback =
                String.Format("__VIEWSTATE={0}&txtUserName={1}" +
                              "&txtPassword={2}&txtMessage=&btnOK=OK",
                              viewState,
                              System.Web.HttpUtility.UrlEncode(username),
                              System.Web.HttpUtility.UrlEncode(password));

            // Second request: POST the username / password data.
            HttpWebRequest request2 =
                (HttpWebRequest)WebRequest.Create(loginDlgUri);
            request2.Method = "POST";
            request2.ContentType = "application/x-www-form-urlencoded";
            request2.CookieContainer = cookies;

            // Write the postback data into the request stream; disposing the
            // writer flushes and closes the stream even if Write throws.
            using (StreamWriter writer =
                       new StreamWriter(request2.GetRequestStream())) {
                writer.Write(postback);
            }
            // The body of the login response is not needed; just release it.
            request2.GetResponse().Close();

            // Third request: the actual page behind the login.
            HttpWebRequest request3 =
                (HttpWebRequest)WebRequest.Create(mainConsoleUri);
            request3.CookieContainer = cookies;
            string page = ReadResponseBody(request3);

            // Our webpage data is in the 'page' string.
            Console.WriteLine(page);
        }
        catch (Exception ex) {
            Console.WriteLine(ex);
        }
    }

    /// <summary>
    /// Sends the request and returns the complete response body as text.
    /// The response and its stream are always released, also on failure.
    /// </summary>
    /// <param name="request">A fully configured, not yet sent request.</param>
    /// <returns>The response body read to the end.</returns>
    private static string ReadResponseBody(HttpWebRequest request) {
        using (WebResponse response = request.GetResponse())
        using (StreamReader reader =
                   new StreamReader(response.GetResponseStream())) {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the view state data from a page.
    /// </summary>
    /// <param name="aspxPage">HTML of the page containing the hidden
    /// __VIEWSTATE input.</param>
    /// <returns>The URL-encoded view state value, ready to be placed in a
    /// form post body; an empty string when the page is null or contains no
    /// view state input.</returns>
    private static string GetViewState(string aspxPage) {
        if (aspxPage == null) {
            return String.Empty;
        }
        Match match = ViewStateRegex.Match(aspxPage);
        // On a failed match, Value is the empty string, which encodes to "".
        return System.Web.HttpUtility.UrlEncode(match.Value);
    }
}
}
// EOF