+ Sitemap generator I created while learning the dispose pattern + Testing project for learning general C#
		
			
				
	
	
		
			115 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			C#
		
	
	
	
	
	
			
		
		
	
	
			115 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			C#
		
	
	
	
	
	
using System.Text.RegularExpressions;
 | 
						|
 | 
						|
namespace SiteMapLibrary;
 | 
						|
 | 
						|
/// <summary>
/// Crawls pages starting from a seed URL, collects every string captured by a
/// caller-supplied regular expression, follows those captures as further URLs,
/// and finally hands all visited URLs to an <see cref="XmlManager"/>.
/// </summary>
/// <remarks>
/// Visited URLs are recorded before the request is sent, so URLs that answered
/// with a non-success status code are still passed to the
/// <see cref="XmlManager"/> when the crawl finishes.
/// </remarks>
public class SiteMap : IDisposable
{
  private readonly HttpClient _client = new();

  // Every distinct capture seen so far; guards against enqueuing a URL twice.
  private readonly HashSet<string> _foundUrls = new();

  // Every URL a request was attempted for (successful or not).
  private readonly HashSet<string> _visitedUrls = new();

  // URLs discovered but not requested yet, processed in FIFO order.
  private readonly Queue<string> _visitQueue = new();

  private bool _disposed;

  private XmlManager XmlManager { get; }

  /// <summary>
  /// The URL that will be requested next, or <c>null</c> once the queue is exhausted.
  /// </summary>
  public string? Url { get; private set; }

  /// <summary>
  /// Pattern applied to each downloaded page; every successful, non-empty
  /// group of every match is treated as a discovered URL.
  /// </summary>
  public Regex Regexp { get; set; }

  /// <summary>
  /// Creates a crawler that builds its own <see cref="XmlManager"/> from <paramref name="savepath"/>.
  /// </summary>
  /// <param name="url">Seed URL the crawl starts from.</param>
  /// <param name="savepath">Value forwarded to the <see cref="XmlManager"/> constructor.</param>
  /// <param name="pattern">Regular expression used to extract URLs from page content.</param>
  /// <exception cref="ArgumentNullException"><paramref name="pattern"/> is <c>null</c>.</exception>
  public SiteMap(string url, string savepath, Regex pattern)
    : this(url, new XmlManager(savepath), pattern)
  {
  }

  /// <summary>
  /// Creates a crawler that reports its results to an existing <see cref="XmlManager"/>.
  /// </summary>
  /// <param name="url">Seed URL the crawl starts from.</param>
  /// <param name="mgr">Manager that receives the visited URLs when the crawl finishes.</param>
  /// <param name="pattern">Regular expression used to extract URLs from page content.</param>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="mgr"/> or <paramref name="pattern"/> is <c>null</c>.
  /// </exception>
  public SiteMap(string url, XmlManager mgr, Regex pattern)
  {
    // Fail at construction instead of with a NullReferenceException mid-crawl
    // (or, for the manager, only after the whole crawl has completed).
    ArgumentNullException.ThrowIfNull(mgr);
    ArgumentNullException.ThrowIfNull(pattern);

    Url = url;
    XmlManager = mgr;
    Regexp = pattern;
  }

  /// <summary>
  /// Requests <see cref="Url"/> and every URL discovered from it until the
  /// queue is empty, then writes all visited URLs through the <see cref="XmlManager"/>.
  /// </summary>
  /// <returns>A task that completes when the crawl is finished and the result has been saved.</returns>
  public async Task Crawl()
  {
    while (Url != null)
    {
      _visitedUrls.Add(Url);

      // The using declaration disposes the response at the end of every
      // iteration, including the early `continue` below, so no explicit
      // Dispose() call is needed.
      using var response = await _client.GetAsync(Url);
      if (!response.IsSuccessStatusCode)
      {
        Console.WriteLine($"{response.StatusCode} on url: {Url}");
        NextUrl();
        continue;
      }

      var body = await response.Content.ReadAsStringAsync();
      for (var match = Regexp.Match(body); match.Success; match = match.NextMatch())
      {
        foreach (Group group in match.Groups)
        {
          // Groups that did not participate in the match (or matched nothing)
          // have an empty Value; requesting "" would throw and abort the crawl.
          if (!group.Success || group.Value.Length == 0)
          {
            continue;
          }

          if (!_foundUrls.Add(group.Value))
          {
            continue;
          }

          Console.WriteLine(group.Value);

          // A value is only enqueued right after its first insertion into
          // _foundUrls, so it cannot already be in the queue; only the
          // visited set (which holds the seed URL) has to be checked.
          if (!_visitedUrls.Contains(group.Value))
          {
            _visitQueue.Enqueue(group.Value);
          }
        }
      }

      NextUrl();
    }

    WriteXml();
  }

  /// <summary>
  /// Adds every visited URL to the <see cref="XmlManager"/>, shortest first, and saves it.
  /// </summary>
  private void WriteXml()
  {
    // OrderBy is a stable sort, so equal-length URLs keep the set's enumeration order.
    foreach (string url in _visitedUrls.OrderBy(u => u.Length))
    {
      XmlManager.AddUrl(url);
    }

    XmlManager.Save();
  }

  /// <summary>
  /// Advances <see cref="Url"/> to the next queued URL, or to <c>null</c> when none are left.
  /// </summary>
  private void NextUrl()
  {
    Url = _visitQueue.TryDequeue(out var next) ? next : null;
  }

  /// <summary>
  /// Releases the underlying <see cref="HttpClient"/>.
  /// </summary>
  public void Dispose()
  {
    Dispose(true);
    GC.SuppressFinalize(this);
  }

  /// <summary>
  /// Releases resources held by this instance; safe to call more than once.
  /// </summary>
  /// <param name="disposing">
  /// <c>true</c> to dispose managed resources (the <see cref="HttpClient"/>);
  /// <c>false</c> to skip them.
  /// </param>
  public void Dispose(bool disposing)
  {
    if (_disposed)
    {
      return;
    }

    if (disposing)
    {
      _client.Dispose();
    }

    _disposed = true;
  }
}
 |