web 2.0

Blocking not Robots.txt Compliant Crawlers

Hi,

Today's article is about blocking crawlers that are not Robots.txt compliant. The number of web sites on the Internet grows exponentially day by day, and so does the number of crawlers! But there are a lot of web crawlers that don't follow the Robots.txt Disallow rules, or that ignore the file entirely. Consider the following Robots.txt:

User-agent: *
Disallow: /login.aspx
Disallow: /search.aspx
Disallow: /error404.aspx

User-agent: Googlebot
Disallow: /Photos.aspx

Here is a way to block them: an HttpModule that parses Robots.txt (once) and checks every request against its rules in the AuthorizeRequest event of the HttpApplication.

/// <summary>
/// HTTP module that enforces the Disallow rules of the site's Robots.txt on the server side.
/// The file is parsed once per application domain; every request is then checked in the
/// AuthorizeRequest event and answered with "Access Denied" (HTTP 403) when a rule matches.
/// </summary>
/// <remarks>
/// NOTE: rules of the "*" group are applied to every client, regular browsers included,
/// because the module cannot tell a crawler from a browser. Rules are matched as
/// case-insensitive substrings of the full request URL, not as path prefixes.
/// </remarks>
public class RobotHttpModule : IHttpModule
{
    /// <summary>
    /// One Robots.txt group: a user-agent token and the Disallow rules listed under it.
    /// </summary>
    private class RobotItem
    {
        public RobotItem()
        {
            DisallowRules = new List<String>();
        }

        public String UserAgent { get; set; }
        public List<String> DisallowRules { get; set; }
    }

    private const String UserAgentDirective = "User-agent";
    private const String DisallowDirective = "Disallow";
    private const String AnyUserAgent = "*";

    // Guards the one-time initialisation of RobotsItems: ASP.NET calls Init once per
    // pooled HttpApplication instance, possibly from several threads.
    private static readonly object SyncRoot = new object();

    // Parsed groups; assigned once and never mutated afterwards, so request threads
    // can read it without locking.
    private static List<RobotItem> RobotsItems;

    #region IHttpModule Members

    /// <summary>
    /// Loads Robots.txt (first call only) and subscribes to AuthorizeRequest.
    /// </summary>
    /// <param name="context">The application instance this module is attached to.</param>
    public void Init(HttpApplication context)
    {
        lock (SyncRoot)
        {
            if (RobotsItems == null)
            {
                String robotsFile = HttpContext.Current.Server.MapPath("~/Robots.txt");

                // A site without Robots.txt simply has no rules; do not fail start-up.
                String[] robotsLines = File.Exists(robotsFile)
                    ? File.ReadAllLines(robotsFile, Encoding.Default)
                    : new String[0];

                RobotsItems = ParseRobots(robotsLines);
            }
        }

        context.AuthorizeRequest += new EventHandler(AuthorizeRequest);
    }

    /// <summary>
    /// Nothing to release: the module holds no unmanaged or disposable resources.
    /// </summary>
    public void Dispose()
    {
    }

    #endregion

    #region Private Methods

    /// <summary>
    /// Turns the lines of a Robots.txt file into groups. Only groups that have a
    /// non-empty user-agent and at least one non-empty Disallow rule are returned.
    /// </summary>
    /// <param name="lines">Raw lines of the Robots.txt file.</param>
    /// <returns>The parsed groups, in file order.</returns>
    private static List<RobotItem> ParseRobots(IEnumerable<String> lines)
    {
        List<RobotItem> items = new List<RobotItem>();
        RobotItem current = null;

        foreach (String rawLine in lines)
        {
            String line = rawLine;

            // Everything after '#' is a comment.
            int commentStart = line.IndexOf('#');
            if (commentStart >= 0)
            {
                line = line.Substring(0, commentStart);
            }

            // Split on the FIRST ':' only, so values containing ':' stay intact.
            int separator = line.IndexOf(':');
            if (separator < 0)
            {
                continue;
            }

            String directive = line.Substring(0, separator).Trim();
            String value = line.Substring(separator + 1).Trim();

            if (directive.Equals(UserAgentDirective, StringComparison.OrdinalIgnoreCase))
            {
                current = new RobotItem();
                current.UserAgent = value;
            }
            else if (directive.Equals(DisallowDirective, StringComparison.OrdinalIgnoreCase))
            {
                // Skip rules that precede any User-agent line or belong to a nameless
                // group. An empty Disallow means "nothing is disallowed"; storing it
                // would match every URL.
                if (current == null
                    || String.IsNullOrEmpty(current.UserAgent)
                    || value.Length == 0)
                {
                    continue;
                }

                current.DisallowRules.Add(value);

                // Register the group when it receives its first rule.
                if (current.DisallowRules.Count == 1)
                {
                    items.Add(current);
                }
            }
        }

        return items;
    }

    /// <summary>
    /// Tells whether any group that applies to <paramref name="userAgent"/> has a
    /// Disallow rule contained in <paramref name="calledUrl"/> (case-insensitive).
    /// </summary>
    private static bool IsDisallowed(String userAgent, String calledUrl)
    {
        foreach (RobotItem item in RobotsItems)
        {
            // A group applies when it is the wildcard group or when its token occurs
            // in the client's User-Agent header. Several groups may apply at once.
            bool applies = item.UserAgent == AnyUserAgent
                || userAgent.IndexOf(item.UserAgent, StringComparison.OrdinalIgnoreCase) >= 0;
            if (!applies)
            {
                continue;
            }

            foreach (String rule in item.DisallowRules)
            {
                if (calledUrl.IndexOf(rule, StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    return true;
                }
            }
        }

        return false;
    }

    /// <summary>
    /// AuthorizeRequest handler: ends the request with "Access Denied" when the
    /// requested URL is disallowed for the caller's user agent.
    /// </summary>
    private void AuthorizeRequest(object sender, EventArgs e)
    {
        HttpRequest request = HttpContext.Current.Request;
        HttpResponse response = HttpContext.Current.Response;

        // The User-Agent header is optional; treat a missing one as empty so that
        // only the wildcard group can apply.
        String userAgent = request.UserAgent ?? String.Empty;
        String calledUrl = request.Url.OriginalString;

        if (IsDisallowed(userAgent, calledUrl))
        {
            response.ClearContent();
            // Report the denial as 403 Forbidden instead of a 200 page.
            response.StatusCode = 403;
            response.Write("Access Denied");
            response.End();
        }
    }

    #endregion
}

Web.config

<httpModules>
    <!-- Registers the module under the name "RobotHttpModule"; "type" is the
         namespace-qualified class name (RobotModule.RobotHttpModule).
         NOTE(review): under the IIS 7+ integrated pipeline the registration presumably
         belongs in system.webServer/modules instead - confirm for your host. -->
    <add name="RobotHttpModule" type="RobotModule.RobotHttpModule"/>
</httpModules>

 

 

Hope this helps!

Download Solution - RobotModule.zip



Views(870)

kick it on DotNetKicks.com

Share/Save/Bookmark Subscribe

Be the first to rate this post

  • Currently 0/5 Stars.
  • 1
  • 2
  • 3
  • 4
  • 5

Tags: ,

ASP.NET | C#

Comments

Technorati Profile