Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Visual Studio
.vs/
../.vs/
*.sln.ide/
*.suo
*.user
*.userosf
*.userprefs
*.ncb
*.opendb
*.opensdf
*.sdf
*.vssscc
*.vspscc
*.vsscc
*.bak
*.log
*.resharper
*.dotCover

# Build results
[Dd]ebug/
[Rr]elease/
x64/
x86/
ARM/
ARM64/
bin/
obj/
[Ll]og/

# Packages
packages/
*.nupkg
*.snupkg
*.dll
*.exe
*.pdb
*.cache
*.xml
# Be careful with this: if your solution relies on JSON data files, they should not be ignored.
*.json

# Rider
.idea/

# Test Results
TestResults/
*.trx

# Temporary files
temp/
tmp/

# User-specific files
*.settings
*.local
*.VC.db

# Visual Studio Code
.vscode/

# EnglishData folder (dataset)
EnglishData/
25 changes: 25 additions & 0 deletions Phase02/Phase02/Phase02.sln
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.14.36310.24 d17.14
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Phase02", "Phase02\Phase02.csproj", "{2A0FB1B8-0196-4672-94BB-F22D839E1D2F}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{2A0FB1B8-0196-4672-94BB-F22D839E1D2F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{2A0FB1B8-0196-4672-94BB-F22D839E1D2F}.Debug|Any CPU.Build.0 = Debug|Any CPU
{2A0FB1B8-0196-4672-94BB-F22D839E1D2F}.Release|Any CPU.ActiveCfg = Release|Any CPU
{2A0FB1B8-0196-4672-94BB-F22D839E1D2F}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {EB0409C8-9820-4D34-A908-57EA4BB14E9B}
EndGlobalSection
EndGlobal
18 changes: 18 additions & 0 deletions Phase02/Phase02/Phase02/Core/FileReader.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// File: Core/FileReader.cs
using System.IO;

namespace Phase02.Core
{
/// <summary>
/// Helper for listing the files contained in a folder.
/// </summary>
public static class FileReader
{
    /// <summary>
    /// Lists the files found in <paramref name="folderPath"/>.
    /// </summary>
    /// <param name="folderPath">Path of the folder to list.</param>
    /// <returns>
    /// Whatever <see cref="Directory.GetFiles(string)"/> reports for the folder,
    /// or an empty array when the folder does not exist.
    /// </returns>
    public static string[] ReadAllFileNames(string folderPath)
        => Directory.Exists(folderPath)
            ? Directory.GetFiles(folderPath)
            : new string[0];
}
}
91 changes: 91 additions & 0 deletions Phase02/Phase02/Phase02/Core/InvertedIndex.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
// File: Core/InvertedIndex.cs
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;

namespace Phase02.Core
{
/// <summary>
/// In-memory inverted index: maps every upper-cased word to the set of
/// document paths whose text contains that word.
/// </summary>
public class InvertedIndex
{
    // A token is a maximal run of regex word characters (\w). This yields the same
    // tokens as replacing every non-word, non-space character with a space and then
    // splitting on whitespace, but the pattern is built only once.
    private static readonly Regex WordPattern = new Regex(@"\w+", RegexOptions.Compiled);

    private readonly Dictionary<string, HashSet<string>> _index = new Dictionary<string, HashSet<string>>();

    /// <summary>Splits <paramref name="text"/> into upper-cased word tokens.</summary>
    private static IEnumerable<string> Tokenize(string text)
    {
        foreach (Match word in WordPattern.Matches(text))
        {
            yield return word.Value.ToUpperInvariant();
        }
    }

    /// <summary>Copies <paramref name="terms"/> into a list; a null sequence becomes an empty list.</summary>
    private static List<string> Materialize(IEnumerable<string> terms)
    {
        return terms == null ? new List<string>() : terms.ToList();
    }

    /// <summary>
    /// Returns a fresh set with the documents matching <paramref name="term"/>.
    /// The term goes through the same tokenizer as indexed text, so punctuation and
    /// letter case do not prevent a match. When the term splits into several tokens,
    /// only documents containing all of them are returned.
    /// </summary>
    private HashSet<string> Lookup(string term)
    {
        var matches = new HashSet<string>();
        if (string.IsNullOrEmpty(term))
        {
            return matches;
        }

        var isFirstToken = true;
        foreach (var token in Tokenize(term))
        {
            if (!_index.TryGetValue(token, out var docSet))
            {
                // One unknown token means no document can contain the whole term.
                matches.Clear();
                return matches;
            }

            if (isFirstToken)
            {
                matches.UnionWith(docSet);
                isFirstToken = false;
            }
            else
            {
                matches.IntersectWith(docSet);
            }
        }

        return matches;
    }

    /// <summary>
    /// Reads the file at <paramref name="documentPath"/> and registers every word it
    /// contains under that path.
    /// </summary>
    /// <param name="documentPath">Path of the document to index.</param>
    /// <remarks>
    /// Indexing is best-effort: a file that cannot be read (I/O failure or access
    /// denied) is skipped with a warning on standard error instead of aborting.
    /// </remarks>
    public void AddDocument(string documentPath)
    {
        string content;
        try
        {
            content = File.ReadAllText(documentPath);
        }
        catch (Exception ex) when (ex is IOException || ex is UnauthorizedAccessException)
        {
            Console.Error.WriteLine($"Warning: skipping '{documentPath}': {ex.Message}");
            return;
        }

        foreach (var token in Tokenize(content))
        {
            if (!_index.TryGetValue(token, out var docSet))
            {
                docSet = new HashSet<string>();
                _index[token] = docSet;
            }

            docSet.Add(documentPath);
        }
    }

    /// <summary>
    /// Finds the documents containing <paramref name="token"/>.
    /// </summary>
    /// <param name="token">Search term; case and punctuation are ignored.</param>
    /// <returns>
    /// A new collection of matching document paths (empty when nothing matches or the
    /// term is null/empty). The collection is a copy, so changing it does not affect
    /// the index.
    /// </returns>
    public IEnumerable<string> Search(string token)
    {
        return Lookup(token);
    }

    /// <summary>
    /// Combines three groups of terms into one query.
    /// </summary>
    /// <param name="mustInclude">Terms that every result must contain. When empty, all indexed documents are candidates.</param>
    /// <param name="atLeastOne">When non-empty, a result must contain at least one of these terms.</param>
    /// <param name="mustExclude">Documents containing any of these terms are removed.</param>
    /// <returns>A new collection with the matching document paths.</returns>
    public IEnumerable<string> SmartSearch(
        IEnumerable<string> mustInclude,
        IEnumerable<string> atLeastOne,
        IEnumerable<string> mustExclude)
    {
        // Materialize once so lazily-produced arguments are not enumerated repeatedly.
        var required = Materialize(mustInclude);
        var optional = Materialize(atLeastOne);
        var excluded = Materialize(mustExclude);

        HashSet<string> result;
        if (required.Count == 0)
        {
            result = new HashSet<string>(_index.Values.SelectMany(docs => docs));
        }
        else
        {
            result = Lookup(required[0]);
            for (var i = 1; i < required.Count && result.Count > 0; i++)
            {
                result.IntersectWith(Lookup(required[i]));
            }
        }

        if (optional.Count > 0)
        {
            var anyOf = new HashSet<string>();
            foreach (var word in optional)
            {
                anyOf.UnionWith(Lookup(word));
            }

            result.IntersectWith(anyOf);
        }

        foreach (var word in excluded)
        {
            result.ExceptWith(Lookup(word));
        }

        return result;
    }
}
}
15 changes: 15 additions & 0 deletions Phase02/Phase02/Phase02/Phase02.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="dotnet-stop-words" Version="1.1.0" />
<PackageReference Include="Porter2Stemmer" Version="1.0.0" />
</ItemGroup>

</Project>
47 changes: 47 additions & 0 deletions Phase02/Phase02/Phase02/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// File: Program.cs
using System;
using System.Collections.Generic;
using System.IO;
using Phase02.Core;

namespace Phase02
{
/// <summary>
/// Console entry point: indexes the files of the "EnglishData" folder and answers
/// one query read from standard input.
/// </summary>
internal class Program
{
    /// <summary>
    /// Builds the index, reads one query line and prints the file names of the
    /// matching documents.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        var baseDir = AppContext.BaseDirectory;
        // baseDir = ...\bin\Debug\net8.0\ so the project root is three levels up.
        var projectRoot = Path.GetFullPath(Path.Combine(baseDir, "..", "..", ".."));
        var dataDir = Path.Combine(projectRoot, "EnglishData");

        var files = FileReader.ReadAllFileNames(dataDir);
        if (files.Length == 0)
        {
            // Make an empty index visible instead of silently returning no results.
            Console.Error.WriteLine($"Warning: no documents found in '{dataDir}'.");
        }

        var index = new InvertedIndex();
        foreach (var file in files)
        {
            index.AddDocument(file);
        }

        Console.Write("Enter query: ");
        var line = Console.ReadLine() ?? "";

        var mustInclude = new List<string>();
        var atLeastOne = new List<string>();
        var mustExclude = new List<string>();
        ParseQuery(line, mustInclude, atLeastOne, mustExclude);

        var results = index.SmartSearch(mustInclude, atLeastOne, mustExclude);

        Console.WriteLine("\nSearch results:");
        foreach (var doc in results)
        {
            Console.WriteLine(Path.GetFileName(doc));
        }
    }

    /// <summary>
    /// Sorts the space-separated words of <paramref name="query"/> into the three
    /// term groups: "+word" goes to <paramref name="atLeastOne"/>, "-word" to
    /// <paramref name="mustExclude"/>, anything else to <paramref name="mustInclude"/>.
    /// </summary>
    private static void ParseQuery(
        string query,
        List<string> mustInclude,
        List<string> atLeastOne,
        List<string> mustExclude)
    {
        foreach (var tok in query.Split(' ', StringSplitOptions.RemoveEmptyEntries))
        {
            // Ordinal char comparison: the prefixes are syntax, not linguistic text.
            var prefix = tok[0];
            var isPrefixed = prefix == '+' || prefix == '-';
            var term = isPrefixed ? tok.Substring(1) : tok;

            if (term.Length == 0)
            {
                // A bare "+" or "-" carries no term. Keeping it would add an empty
                // "at least one" entry that matches nothing and wipes out every result.
                continue;
            }

            if (prefix == '+')
            {
                atLeastOne.Add(term);
            }
            else if (prefix == '-')
            {
                mustExclude.Add(term);
            }
            else
            {
                mustInclude.Add(term);
            }
        }
    }
}
}