From 591133c698091ff17714780b4e08ac93e66164df Mon Sep 17 00:00:00 2001
From: dhuck
Date: Wed, 15 Mar 2023 17:34:50 -0500
Subject: [PATCH 1/2] added haskell parsing modelled after C parser

---
 comment_parser/parsers/haskell_parser.py      | 65 +++++++++++++
 .../parsers/tests/haskell_parser_test.py      | 97 +++++++++++++++++++
 2 files changed, 162 insertions(+)
 create mode 100644 comment_parser/parsers/haskell_parser.py
 create mode 100644 comment_parser/parsers/tests/haskell_parser_test.py

diff --git a/comment_parser/parsers/haskell_parser.py b/comment_parser/parsers/haskell_parser.py
new file mode 100644
index 0000000..cd53898
--- /dev/null
+++ b/comment_parser/parsers/haskell_parser.py
@@ -0,0 +1,65 @@
+#!/usr/bin/python
+"""This module provides methods for parsing comments from Haskell source code"""
+
+import re
+from bisect import bisect_left
+from typing import List
+from comment_parser.parsers import common
+
+
+def extract_comments(code: str) -> List[common.Comment]:
+  """Extracts a list of comments from the given Haskell source code
+
+  Comments are represented with the Comment class found in the common module.
+  Haskell comments come in two forms, single and multi-line comments.
+    - Single line comments begin with `--` and continue until the end of the
+      line
+    - Multi-line comments begin with `{-` and end with `-}` and can span an
+      arbitrary number of lines of code. If the multi-line comment does not
+      terminate before EOF is reached, then an exception is raised.
+
+  Limitation: a nested `{- {- -} -}` comment ends at its first `-}`.
+  More information: https://wiki.haskell.org/Language_and_library_specification
+
+  Args:
+    code (str): String containing code to extract comments from.
+  Returns:
+    List[common.Comment]: Python list of common.Comment in the order that
+      they appear in the code
+  Raises:
+    common.UnterminatedCommentError: Encountered an unterminated multi-line
+      comment
+  """
+
+  pattern = r"""
+    (?P<literal> (\"(\\.|[^\"\\\n])*\")+) |
+    (?P<single> \-\-(?P<single_content>.*)?$) |
+    (?P<multi> {\-(?P<multi_content>(.|\n)*?)?\-}) |
+    (?P<error> {\-(.*)?)
+  """
+
+  compiled = re.compile(pattern, re.VERBOSE | re.MULTILINE)
+
+  lines_indexes = []
+  for match in re.finditer(r"$", code, re.M):
+    lines_indexes.append(match.start())
+
+  comments = []
+  for match in compiled.finditer(code):
+    kind = match.lastgroup
+
+    start_character = match.start()
+    line_no = bisect_left(lines_indexes, start_character)
+
+    if kind == "single":
+      comment_content = match.group("single_content")
+      comment = common.Comment(comment_content, line_no + 1)
+      comments.append(comment)
+    elif kind == "multi":
+      comment_content = match.group("multi_content")
+      comment = common.Comment(comment_content, line_no + 1, multiline=True)
+      comments.append(comment)
+    elif kind == "error":
+      raise common.UnterminatedCommentError()
+
+  return comments
\ No newline at end of file
diff --git a/comment_parser/parsers/tests/haskell_parser_test.py b/comment_parser/parsers/tests/haskell_parser_test.py
new file mode 100644
index 0000000..aedc096
--- /dev/null
+++ b/comment_parser/parsers/tests/haskell_parser_test.py
@@ -0,0 +1,97 @@
+#!/usr/bin/python
+"""Tests for comment_parser.parsers.haskell_parser.py"""
+
+import unittest
+from comment_parser.parsers import common
+from comment_parser.parsers import haskell_parser
+
+
+class HaskellParserTest(unittest.TestCase):
+
+  def testSimpleMain(self):
+    code = "-- this is a comment\nmodule main where\nmain = putStrLn \"Hello, World!\""
+    comments = haskell_parser.extract_comments(code)
+    expected = [common.Comment(code[2:20], 1, multiline=False)]
+    self.assertEqual(comments, expected)
+
+  def testSingleLineComment(self):
+    code = "-- single line comment"
+    comments = haskell_parser.extract_comments(code)
+    expected = [common.Comment(code[2:], 1, multiline=False)]
+    self.assertEqual(comments, expected)
+
+  def testSingleLineCommentInStringLiteral(self):
+    code = 'a = "-- this is not a comment"'
+    comments = haskell_parser.extract_comments(code)
+    self.assertEqual(comments, [])
+
+  def testMultiLineComment(self):
+    code = '{- multiline\ncomment -}'
+    comments = haskell_parser.extract_comments(code)
+    expected = [common.Comment(code[2:-2], 1, multiline=True)]
+    self.assertEqual(comments, expected)
+
+  def testMultiLineCommentsWithDashes(self):
+    code = "{----------------------}"
+    comments = haskell_parser.extract_comments(code)
+    expected = [common.Comment(code[2:-2], 1, multiline=True)]
+    self.assertEqual(comments, expected)
+
+  def testMultilineCommentInStringLiteral(self):
+    code = 'a = "{- this is not a comment -}"'
+    comments = haskell_parser.extract_comments(code)
+    self.assertEqual(comments, [])
+
+  def testMultiLineCommentUnterminated(self):
+    code = 'int a = 1; {- Unterminated\\n comment'
+    self.assertRaises(common.UnterminatedCommentError,
+                      haskell_parser.extract_comments, code)
+
+  def testMultipleMultilineComments(self):
+    code = '{- abc -} {- 123 -}'
+    expected = [
+        common.Comment(' abc ', 1, multiline=True),
+        common.Comment(' 123 ', 1, multiline=True)
+    ]
+    comments = haskell_parser.extract_comments(code)
+    self.assertEqual(comments, expected)
+
+  def testStringThenComment(self):
+    code = r'"" {- "abc -}'
+    comments = haskell_parser.extract_comments(code)
+    expected = [common.Comment(' "abc ', 1, multiline=True)]
+    self.assertEqual(comments, expected)
+
+  def testStringEscapedBackslashCharacter(self):
+    code = r'"\\"'
+    comments = haskell_parser.extract_comments(code)
+    self.assertEqual(comments, [])
+
+  def testTwoStringsFollowedByComment(self):
+    code = r'"""" -- foo'
+    comments = haskell_parser.extract_comments(code)
+    expected = [common.Comment(' foo', 1)]
+    self.assertEqual(comments, expected)
+
+  def testCommentedMultilineComment(self):
+    code = '''-- What if i start a {- here
+    int main(){return 0;}
+    -- and ended it here -}'''
+    comments = haskell_parser.extract_comments(code)
+    expected = [
+        common.Comment(" What if i start a {- here", 1, False),
+        common.Comment(" and ended it here -}", 3, False)
+    ]
+    self.assertEqual(comments, expected)
+
+  def testMultilineCommentedComment(self):
+    code = '''{--- here
+    int main(){return 0;}
+    -}-- and ended it here -}'''
+    comments = haskell_parser.extract_comments(code)
+    expected = [
+        common.Comment('-- here\n    int main(){return 0;}\n    ', 1,
+                       True),
+        common.Comment(' and ended it here -}', 3, False)
+    ]
+    self.assertEqual(comments, expected)
\ No newline at end of file

From 0ed99c10fb2fd033fe38189136572cdb651ec1e3 Mon Sep 17 00:00:00 2001
From: dhuck
Date: Wed, 15 Mar 2023 20:16:56 -0500
Subject: [PATCH 2/2] add haskell to parser

---
 comment_parser/comment_parser.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/comment_parser/comment_parser.py b/comment_parser/comment_parser.py
index 25c9b44..7ce5069 100755
--- a/comment_parser/comment_parser.py
+++ b/comment_parser/comment_parser.py
@@ -33,6 +33,7 @@
 from comment_parser.parsers import python_parser
 from comment_parser.parsers import ruby_parser
 from comment_parser.parsers import shell_parser
+from comment_parser.parsers import haskell_parser
 
 MIME_MAP = {
     'application/javascript': js_parser,  # Javascript
@@ -48,6 +49,7 @@
     'text/x-script.python': python_parser,  # Python
     'text/x-shellscript': shell_parser,  # Unix shell
     'text/xml': html_parser,  # XML
+    'text/x-haskell': haskell_parser,  # Haskell
 }
 
 