using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Formats.Tar;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace HtmlParser
{
/// <summary>
/// Extracts simple tags, attribute values and inner text from a single line of HTML
/// using pattern matching, and writes what it finds to the console.
/// </summary>
/// <remarks>
/// This is a lightweight line-oriented scanner, not a full HTML parser: a tag must open
/// and close on the same line, attribute values must be double-quoted and must not contain
/// '&gt;', and nested tags of the same name are not supported.
/// </remarks>
internal class Element
{
    /// <summary>Characters that end a query-string value inside a URL or attribute.</summary>
    private static readonly char[] QueryValueTerminators = { '&', '#', '"' };

    public Element()
    {
    }

    /// <summary>
    /// Reports the first <paramref name="tagName"/> element found in <paramref name="line"/>
    /// to the console. Not called from within this class; kept for parity with
    /// <see cref="GetTag1"/>, with which it shares the reporting logic.
    /// </summary>
    /// <param name="line">One line of HTML text.</param>
    /// <param name="tagName">Tag name to look for, e.g. <c>a</c>.</param>
    private void GetTag(string line, string tagName)
    {
        var match = BuildTagRegex(tagName).Match(line);
        if (match.Success)
        {
            WriteTagDetails(match.Value, tagName);
        }
    }

    /// <summary>
    /// Finds every <paramref name="tagName"/> element in <paramref name="line"/>, writes each
    /// element's text, <c>href</c>, <c>genre_item</c> query value, <c>class</c> and inner text
    /// to the console (one per line, in that order), and returns the last element found.
    /// </summary>
    /// <param name="line">One line of HTML text.</param>
    /// <param name="tagName">Tag name to look for, e.g. <c>a</c>.</param>
    /// <returns>The full text of the last matching element, or an empty string if none matched.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="line"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="tagName"/> is null or empty.</exception>
    public string GetTag1(string line, string tagName)
    {
        if (line == null)
        {
            throw new ArgumentNullException(nameof(line));
        }

        var rtn = string.Empty;
        foreach (Match item in BuildTagRegex(tagName).Matches(line))
        {
            WriteTagDetails(item.Value, tagName);
            rtn = item.Value;
        }
        return rtn;
    }

    /// <summary>
    /// Builds the regex that matches one complete element: an opening tag with at least one
    /// whitespace character after the name, the shortest possible content, and the closing tag.
    /// </summary>
    private static Regex BuildTagRegex(string tagName)
    {
        if (string.IsNullOrEmpty(tagName))
        {
            throw new ArgumentException("Tag name must not be null or empty.", nameof(tagName));
        }

        // Escape the name so regex metacharacters in it are treated literally.
        var name = Regex.Escape(tagName);

        // Lazy ".*?" keeps each element a separate match when a line holds several of them;
        // "</" is required so that content merely ending in "name>" is not taken as a close.
        return new Regex("<" + name + @"\s[^>]*>.*?</" + name + @"\s*>");
    }

    /// <summary>Writes the details of one matched element to the console.</summary>
    private void WriteTagDetails(string tag, string tagName)
    {
        // Attributes are looked up in the opening tag only, so text inside the element
        // that happens to look like an attribute is not picked up.
        int openEnd = tag.IndexOf('>');
        string openingTag = openEnd < 0 ? tag : tag.Substring(0, openEnd + 1);

        Console.WriteLine(tag);
        string href = GetAttr(openingTag, "href");
        Console.WriteLine(href);
        Console.WriteLine(GetQueryString(href, "genre_item"));
        Console.WriteLine(GetAttr(openingTag, "class"));
        Console.WriteLine(GetValue(tag, tagName));
    }

    /// <summary>
    /// Returns the double-quoted value of the attribute named <paramref name="attrPattern"/>.
    /// </summary>
    /// <param name="str">Text to search, normally an opening tag.</param>
    /// <param name="attrPattern">Attribute name, e.g. <c>href</c>.</param>
    /// <returns>
    /// The attribute value without quotes; everything to the end of <paramref name="str"/>
    /// if the closing quote is missing; an empty string if the attribute is not present.
    /// </returns>
    private string GetAttr(string str, string attrPattern)
    {
        string marker = attrPattern + "=\"";
        int searchFrom = 0;
        while (searchFrom < str.Length)
        {
            int at = str.IndexOf(marker, searchFrom, StringComparison.Ordinal);
            if (at < 0)
            {
                break;
            }

            // Accept only a whole attribute name, so "href" does not match inside "data-href".
            if (at == 0 || char.IsWhiteSpace(str[at - 1]))
            {
                int valueStart = at + marker.Length;
                int valueEnd = str.IndexOf('"', valueStart);
                return valueEnd < 0
                    ? str.Substring(valueStart)
                    : str.Substring(valueStart, valueEnd - valueStart);
            }

            searchFrom = at + 1;
        }
        return string.Empty;
    }

    /// <summary>
    /// Returns the inner content of an element: the text between the end of the opening tag
    /// and the start of the last <c>&lt;/tagName</c> closing tag.
    /// </summary>
    /// <param name="str">Full element text, e.g. <c>&lt;a href="x"&gt;text&lt;/a&gt;</c>.</param>
    /// <param name="attrPattern">The tag name whose closing tag ends the content.</param>
    /// <returns>
    /// The inner content; everything after the opening tag if no closing tag is found;
    /// an empty string if <paramref name="str"/> contains no '&gt;'.
    /// </returns>
    private string GetValue(string str, string attrPattern)
    {
        int openEnd = str.IndexOf('>');
        if (openEnd < 0)
        {
            return string.Empty;
        }

        int start = openEnd + 1;
        int end = str.LastIndexOf("</" + attrPattern, StringComparison.Ordinal);
        if (end < start)
        {
            // No closing tag after the opening tag: keep the remainder of the text.
            end = str.Length;
        }
        return str.Substring(start, end - start);
    }

    /// <summary>
    /// Returns the value of the query-string parameter named <paramref name="attrPattern"/>.
    /// </summary>
    /// <param name="str">A URL or attribute value containing a query string.</param>
    /// <param name="attrPattern">Parameter name, e.g. <c>genre_item</c>.</param>
    /// <returns>
    /// The parameter value up to the next '&amp;', '#' or '"' (or the end of the text);
    /// an empty string if the parameter is not present.
    /// </returns>
    private string GetQueryString(string str, string attrPattern)
    {
        string marker = attrPattern + "=";
        int searchFrom = 0;
        while (searchFrom < str.Length)
        {
            int at = str.IndexOf(marker, searchFrom, StringComparison.Ordinal);
            if (at < 0)
            {
                break;
            }

            // Accept only a whole parameter name: at the start, or right after '?', '&',
            // or ';' (the tail of an HTML-encoded "&amp;").
            if (at == 0 || str[at - 1] == '?' || str[at - 1] == '&' || str[at - 1] == ';')
            {
                int valueStart = at + marker.Length;
                int valueEnd = str.IndexOfAny(QueryValueTerminators, valueStart);
                return valueEnd < 0
                    ? str.Substring(valueStart)
                    : str.Substring(valueStart, valueEnd - valueStart);
            }

            searchFrom = at + 1;
        }
        return string.Empty;
    }
}
/// <summary>
/// Reads a UTF-8 text file line by line and passes every line to
/// <see cref="Element.GetTag1"/> to look for <c>a</c> tags.
/// All work happens in the constructor; the values returned by
/// <see cref="Element.GetTag1"/> are not kept.
/// </summary>
internal class Parser
{
    /// <summary>File processed by the parameterless constructor.</summary>
    private const string DefaultFileName = @"C:\path\to\dir\1.txt";

    /// <summary>Processes the default input file.</summary>
    public Parser()
        : this(DefaultFileName)
    {
    }

    /// <summary>Processes the given input file.</summary>
    /// <param name="filename">Path of the UTF-8 encoded text file to scan.</param>
    /// <exception cref="ArgumentNullException"><paramref name="filename"/> is null.</exception>
    public Parser(string filename)
    {
        if (filename == null)
        {
            throw new ArgumentNullException(nameof(filename));
        }

        var element = new Element();
        using (var sr = new StreamReader(filename, System.Text.Encoding.UTF8))
        {
            // ReadLine returns null at end of input. This is used instead of Peek(),
            // which also reports -1 when the underlying stream cannot seek.
            for (var line = sr.ReadLine(); line != null; line = sr.ReadLine())
            {
                element.GetTag1(line, "a");
            }
        }
    }
}
}