Bill Butler's answer makes a good point.
Do the file sizes vary wildly, or are they approximately the same over the
sample size? This could account for differences. Also, look at your
process. If some files have more replacements than others, then the work
being done in each 10 seconds is not properly counted by file count.
I understand, but the sizes in bytes of these files are uniformly
distributed...
It's not like, files from 20000 - 30000 are larger or something like
that...
Is it that, as time progresses, the number of garbage-collection calls
increases, and that overhead hampers my performance
over time?
Possible, but I wouldn't expect that to be the problem.
Why wouldn't this be a problem?
This is the complete source code.
But the dataset is huge, which I cannot upload...
using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Text;
using System.Xml;
using SearchEngine.PorterStemmerAlgorithm;
namespace SearchEngine
{
class Program
{
    /// <summary>
    /// Recursively collects the full paths of every subdirectory of <paramref name="di"/>
    /// into <paramref name="directoryList"/>.
    /// NOTE(review): <paramref name="searchPattern"/> is accepted but never used here;
    /// it is kept only for signature compatibility with existing callers.
    /// </summary>
    static private void GetDirectoryList(DirectoryInfo di, string searchPattern,
        ref ArrayList directoryList)
    {
        foreach (DirectoryInfo d in di.GetDirectories())
        {
            directoryList.Add(d.FullName);
            GetDirectoryList(d, searchPattern, ref directoryList);
        }
    }

    /// <summary>
    /// Crawls the directory given in args[0], parses each *.txt file as XML,
    /// tokenizes/stems its TEXT node, and prints throughput statistics.
    /// </summary>
    static void Main(string[] args)
    {
        const int INCRTIME = 1; // reporting interval in seconds

        Console.WriteLine("Starting Crawler... (removed rep)");
        Searcher s = new Searcher();
        s.ReadStopWords("stopwords.txt");

        if (args.Length != 1)
        {
            Console.WriteLine("Type a dir name...!");
            return;
        }

        StringBuilder sbFileContents = new StringBuilder();
        // Stopwatch is monotonic; DateTime.Now can jump with wall-clock changes
        // and has coarse resolution, which skews per-file timings.
        Stopwatch total = Stopwatch.StartNew();

        string searchPattern = "*.txt";
        string InitDirectory = args[0];
        ArrayList directoryList = new ArrayList();
        DirectoryInfo di = new DirectoryInfo(InitDirectory);
        // BUG FIX: the original collected only subdirectories, so files lying
        // directly in the start directory were never processed.
        directoryList.Add(di.FullName);
        GetDirectoryList(di, searchPattern, ref directoryList);
        Console.WriteLine("Total Folders: " + directoryList.Count);
        Console.WriteLine("(Time\tFiles Processed)");

        int count = 0;
        int prev = INCRTIME;
        foreach (string directory in directoryList)
        {
            DirectoryInfo cdi = new DirectoryInfo(directory);
            foreach (FileInfo fi in cdi.GetFiles(searchPattern))
            {
                try
                {
                    Stopwatch perFile = Stopwatch.StartNew();
                    sbFileContents.Length = 0; // reuse the builder across files

                    string fileContent;
                    // BUG FIX: using-statement guarantees the reader is closed
                    // even when parsing throws; the original leaked the handle
                    // on every exception path.
                    using (StreamReader rd = File.OpenText(fi.FullName))
                    {
                        fileContent = rd.ReadToEnd();
                    }

                    // BUG FIX: the original called Replace("&", "&") twice — a
                    // no-op, almost certainly HTML-mangled from Replace("&", "&amp;").
                    // Raw ampersands make XmlDocument.LoadXml throw.
                    // NOTE(review): this double-escapes any entity already present
                    // (e.g. "&amp;" -> "&amp;amp;") — confirm against the dataset.
                    fileContent = fileContent.Replace("&", "&amp;");

                    XmlDocument xdoc = new XmlDocument();
                    xdoc.LoadXml(fileContent);
                    String docNo = xdoc.SelectSingleNode("//DOCNO").InnerText;
                    String docType = xdoc.SelectSingleNode("//DOCTYPE").InnerText;
                    String txtType = xdoc.SelectSingleNode("//TXTTYPE").InnerText;
                    String text = xdoc.SelectSingleNode("//TEXT").InnerText;

                    sbFileContents.Append(text);
                    s.StripPunctuation(ref sbFileContents);
                    string[] tokenizedArray = s.Tokenize(ref sbFileContents, true);
                    //s.ConvertToWordID(ref tokenizedArray);
                    //s.Vectorize(docNo, docType, txtType, tokenizedArray);

                    count++;
                    double fulldiff = total.Elapsed.TotalMilliseconds;
                    double diff = perFile.Elapsed.TotalMilliseconds;
                    Console.WriteLine("Time taken for this file (ms):" + diff.ToString());
                    if (fulldiff / 1000 > prev)
                    {
                        Console.WriteLine((((int)(fulldiff / 1000)).ToString())
                            + "\t" + count.ToString());
                        prev += INCRTIME;
                    }
                }
                catch (Exception ex)
                {
                    // BUG FIX: the original returned here, so one malformed file
                    // aborted the whole crawl. Log and move on to the next file.
                    Console.WriteLine(fi.FullName + " - " + ex.Message);
                    continue;
                }
            }
        }

        double finaldiff = total.Elapsed.TotalMilliseconds;
        Console.WriteLine("Done - " + count + " files found!");
        Console.WriteLine("Time taken so far (sec):" + ((int)(finaldiff / 1000)).ToString());
        Console.ReadKey();
    }
}
public class Searcher
{
    // Stopword set. HashSet gives O(1) membership tests instead of the original
    // O(n) List scan per token; the OrdinalIgnoreCase comparer preserves the
    // original Equals(..., StringComparison.OrdinalIgnoreCase) semantics.
    private HashSet<string> stopwords =
        new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    /// <summary>
    /// Loads one stopword per line from <paramref name="stopWordsFile"/>.
    /// Best-effort: silently returns if the file cannot be opened, leaving the
    /// stopword list empty (original behavior).
    /// </summary>
    public void ReadStopWords(string stopWordsFile)
    {
        TextReader tr;
        try
        {
            tr = new StreamReader(stopWordsFile);
        }
        catch (System.IO.IOException)
        {
            // Missing/unreadable stopword file => run without stopword filtering.
            return;
        }
        // BUG FIX: the original never closed the reader; using disposes it.
        using (tr)
        {
            string str;
            while ((str = tr.ReadLine()) != null)
            {
                stopwords.Add(str);
            }
        }
    }

    /// <summary>
    /// Splits <paramref name="fileContents"/> on spaces, '?' and ". ",
    /// optionally Porter-stems each token, and drops stopwords.
    /// </summary>
    /// <param name="fileContents">Text to tokenize (not modified).</param>
    /// <param name="bStem">When true, each token is stemmed before filtering.</param>
    /// <returns>The surviving tokens, with no trailing nulls.</returns>
    public string[] Tokenize(ref StringBuilder fileContents, bool bStem)
    {
        string[] delimiters = { " ", "?", ". " };
        string[] strArray = fileContents.ToString()
            .Split(delimiters, StringSplitOptions.RemoveEmptyEntries);

        // BUG FIX: the original returned an array sized strArray.Length, which
        // left trailing null entries whenever stopwords were filtered out.
        List<string> tokens = new List<string>(strArray.Length);
        PorterStemmer ps = new PorterStemmer();
        foreach (String str in strArray)
        {
            string word = bStem ? ps.stemTerm(str) : str;
            if (!IsStopWord(word))
            {
                tokens.Add(word);
            }
        }
        return tokens.ToArray();
    }

    /// <summary>True if <paramref name="word"/> is a stopword (case-insensitive).</summary>
    private bool IsStopWord(string word)
    {
        // Comparer on the set handles the case-insensitive match.
        return stopwords.Contains(word);
    }

    /// <summary>
    /// Replaces each punctuation character in <paramref name="sbFileContents"/>
    /// with a space, in place.
    /// </summary>
    public void StripPunctuation(ref StringBuilder sbFileContents)
    {
        // BUG FIX: the original manual scan did not compile —
        // StringBuilder has no CompareTo(char), and a char was assigned to the
        // builder. This is the (previously commented-out) Replace version;
        // StringBuilder.Replace mutates in place, so no reassignment is needed.
        char[] punctuations = { '#', '!', '*', '-', '"', ',' };
        foreach (char ch in punctuations)
        {
            sbFileContents.Replace(ch, ' ');
        }
    }

    /// <summary>
    /// Placeholder: currently just prints each token to the console.
    /// </summary>
    public void ConvertToWordID(ref string[] tokenizedArray)
    {
        // Mugunth_Dummy_Code — real word-ID mapping goes here.
        foreach (String str in tokenizedArray)
        {
            Console.WriteLine(str);
        }
    }
}
}
Mugunth