Added BrWac dataset #3880

Open · wants to merge 21 commits into `master`
1 change: 1 addition & 0 deletions setup.py
@@ -146,6 +146,7 @@
# In alphabetical order
'aflw2k3d': ['scipy'],
'beir': ['apache_beam'],
'brwac': ['ftfy'],
'ble_wind_field': ['gcsfs', 'zarr'],
'c4': ['apache_beam', 'gcld3', 'langdetect', 'nltk', 'tldextract'],
'cats_vs_dogs': ['matplotlib'],
3 changes: 3 additions & 0 deletions tensorflow_datasets/core/lazy_imports_lib.py
@@ -77,6 +77,9 @@ def envlogger(cls):

@utils.classproperty
@classmethod
def ftfy(cls):
return _try_import("ftfy")

  @utils.classproperty
  @classmethod
  def gcsfs_store(cls):
    return _try_import("gcsfs").GCSFileSystem(token='anon').get_mapper

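With the property in place, `ftfy` resolves like any other TFDS lazy import: the module is only imported on first attribute access. A minimal usage sketch (the mojibake string is just an illustration):

```python
import tensorflow_datasets as tfds

# ftfy is imported here, on first access, not at `import tensorflow_datasets` time.
ftfy = tfds.core.lazy_imports.ftfy
print(ftfy.fix_text('NaÃ¯ve text'))  # -> 'Naïve text'
```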
1 change: 1 addition & 0 deletions tensorflow_datasets/core/lazy_imports_lib_test.py
@@ -29,6 +29,7 @@ class LazyImportsTest(testing.TestCase, parameterized.TestCase):
@parameterized.parameters(
"bs4",
"cv2",
"ftfy",
"gcld3",
"gcsfs_store",
"langdetect",
1 change: 1 addition & 0 deletions tensorflow_datasets/text/__init__.py
@@ -21,6 +21,7 @@
from tensorflow_datasets.text.beir import Beir
from tensorflow_datasets.text.blimp import Blimp
from tensorflow_datasets.text.bool_q import BoolQ
from tensorflow_datasets.text.brwac import Brwac
from tensorflow_datasets.text.c4 import C4
from tensorflow_datasets.text.cfq import CFQ
from tensorflow_datasets.text.cfq import CFQConfig
3 changes: 3 additions & 0 deletions tensorflow_datasets/text/brwac/__init__.py
@@ -0,0 +1,3 @@
"""brwac dataset."""

from .brwac import Brwac
101 changes: 101 additions & 0 deletions tensorflow_datasets/text/brwac/brwac.py
@@ -0,0 +1,101 @@
# coding=utf-8
# Copyright 2021 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""brwac dataset."""

import tensorflow as tf
import tensorflow_datasets.public_api as tfds
from tensorflow_datasets.text.brwac.brwac_utils import parse_vert_file

_HOMEPAGE = 'https://www.inf.ufrgs.br/pln/wiki/index.php?title=BrWaC'


# pylint: disable=line-too-long
_DESCRIPTION = f"""\
The Brazilian Portuguese Web as Corpus (BrWaC) is a large corpus constructed
following the Wacky framework and made publicly available for research
purposes.

The current corpus version, released in January 2017, is composed of 3.53
million documents and 2.68 billion tokens. In order to use this dataset, you
must request access by filling out the form on the
[official homepage]({_HOMEPAGE}).

Please note that this resource is available solely for academic research
purposes, and you agree not to use it for any commercial application.

The title and text fields are preprocessed with the
[ftfy](https://github.com/rspeer/python-ftfy) Python library
([Speer, 2019](http://doi.org/10.5281/zenodo.2591652)).

Note: this description is adapted from the [official homepage]({_HOMEPAGE}).
"""

_CITATION = """
@inproceedings{wagner2018brwac,
title={The brWaC Corpus: A New Open Resource for Brazilian Portuguese},
author={{Wagner Filho}, Jorge A and Wilkens, Rodrigo and Idiart, Marco and Villavicencio, Aline},
booktitle={Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)},
year={2018}
}
"""


class Brwac(tfds.core.GeneratorBasedBuilder):
"""DatasetBuilder for brwac dataset."""

VERSION = tfds.core.Version('1.0.0')
RELEASE_NOTES = {
'1.0.0': 'Initial release.',
}
MANUAL_DOWNLOAD_INSTRUCTIONS = """\
Fill out the form at https://www.inf.ufrgs.br/pln/wiki/index.php?title=BrWaC
and wait for the email with download instructions. In the email, download
the file by clicking the link labeled `1) Basic VERT corpus
file (4.45GB as .7z)`. Extract this file and place its output
(`brwac.vert`, approx. 22GB in size) in `manual_dir/`.
"""

def _info(self) -> tfds.core.DatasetInfo:
"""Returns the dataset metadata."""
return tfds.core.DatasetInfo(
builder=self,
description=_DESCRIPTION,
features=tfds.features.FeaturesDict(
{
'doc_id': tfds.features.Text(),
'doc_idx': tf.int64,
'title': tfds.features.Text(),
'uri': tfds.features.Text(),
'text': tfds.features.Sequence({
'paragraphs': tfds.features.Sequence(tfds.features.Text())
}),
}
),
supervised_keys=None,
homepage=_HOMEPAGE,
citation=_CITATION,
)

def _split_generators(self, dl_manager: tfds.download.DownloadManager):
"""Returns SplitGenerators."""
path = dl_manager.manual_dir

return {
'train': self._generate_examples(path / 'brwac.vert'),
}

def _generate_examples(self, path):
# """Yields examples."""
for doc in parse_vert_file(path=path, show_progress=True):
yield doc['doc_idx'], {
k: doc[k] for k in ['doc_id', 'title', 'uri', 'text', 'doc_idx']
}
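For reference, a minimal sketch of how the builder would be used once `brwac.vert` is in place. `/path/to/manual_dir` is a placeholder; the rest is standard TFDS API:

```python
import tensorflow_datasets as tfds

builder = tfds.builder('brwac')
builder.download_and_prepare(
    download_config=tfds.download.DownloadConfig(
        manual_dir='/path/to/manual_dir'))  # directory containing brwac.vert
ds = builder.as_dataset(split='train')

for example in ds.take(1):
  print(example['title'])               # scalar tf.string
  print(example['text']['paragraphs'])  # nested sequence: paragraphs of sentences
```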
30 changes: 30 additions & 0 deletions tensorflow_datasets/text/brwac/brwac_test.py
@@ -0,0 +1,30 @@
# coding=utf-8
# Copyright 2021 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""brwac dataset."""

import tensorflow_datasets as tfds
from . import brwac


class BrwacTest(tfds.testing.DatasetBuilderTestCase):
"""Tests for brwac dataset."""
DATASET_CLASS = brwac.Brwac
SPLITS = {'train': 20}

DL_EXTRACT_RESULT = {'train': 'brwac.vert'}


if __name__ == '__main__':
tfds.testing.test_main()
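`DatasetBuilderTestCase` reads its input from the dataset's `dummy_data/` directory, so the fake `brwac.vert` placed there needs 20 documents to satisfy `SPLITS`. A hypothetical helper (not part of this PR) that would generate such a file, matching the `</doc>` end-of-document convention the parser relies on:

```python
def write_dummy_vert(path: str, n_docs: int = 20) -> None:
  """Writes a minimal fake BrWaC .vert file with `n_docs` documents."""
  with open(path, 'w', encoding='utf-8') as f:
    for i in range(n_docs):
      f.write(f'<doc docid="doc{i}" title="Título {i}" '
              f'uri="http://example.com/{i}">\n')
      f.write('<p> <s> Uma frase de exemplo . </s> </p>\n')
      f.write('</doc>\n')
```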
81 changes: 81 additions & 0 deletions tensorflow_datasets/text/brwac/brwac_utils.py
@@ -0,0 +1,81 @@
# coding=utf-8
# Copyright 2021 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities for the brwac dataset."""

import re
from typing import Dict

import tensorflow as tf
import tensorflow_datasets.public_api as tfds
from tqdm import tqdm


paragraph_pattern = re.compile('<p> (.*?) </p>', flags=re.DOTALL)
sentence_pattern = re.compile('<s> (.*?) </s>', flags=re.DOTALL)


def extract_ids(line, fix_title=True):
  """Extracts `doc_id`, `title` and `uri` from a document header line."""
  matches = re.findall('<doc docid="(.*?)" title="(.*?)" uri="(.*?)">', line)
  assert len(matches) == 1
  doc_id, title, uri = matches[0]
  if fix_title:
    # Resolve the lazy import at call time so that importing this module
    # does not require ftfy to be installed.
    title = tfds.core.lazy_imports.ftfy.fix_text(title)
  return dict(doc_id=doc_id, title=title, uri=uri)


def parse_single_doc(doc_string: str) -> Dict:
  """Parses a single BrWaC document."""
  ftfy = tfds.core.lazy_imports.ftfy  # imported lazily, on first use
  doc_header, doc_body = doc_string.split('\n', maxsplit=1)
  # Drop `<g/>` glue markers and collapse all whitespace to single spaces.
  doc_body = ' '.join(doc_body.replace('\n<g/>\n', '').split())
  paragraphs = [
      list(map(ftfy.fix_text, re.findall(sentence_pattern, sentences)))
      for sentences in re.findall(paragraph_pattern, doc_body)
  ]
  return_dict = extract_ids(doc_header)
  return_dict.update({'text': {'paragraphs': paragraphs}})
  return return_dict


def parse_vert_file(path: str, show_progress: bool = True):
"""Parses brwac vert file.

Args:
path: path for file
show_progress (bool): if we should show a progress bar

Yields:
dict with a BrWac document contents
"""
doc_buffer = ''
doc_count = 0
with tf.io.gfile.GFile(path, 'r') as fin:
pbar = tqdm(
fin,
desc=f'Parsing BrWac vert file {path}.',
disable=not show_progress,
unit=' lines processed'
)
for line in pbar:
doc_buffer += line
if line == '</doc>\n': # end of document
parsed_doc = parse_single_doc(doc_buffer)
parsed_doc['doc_idx'] = doc_count
yield parsed_doc
doc_buffer = ''
doc_count += 1
pbar.set_postfix(**{'Documents processed': doc_count})
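To make the expected input concrete, here is a synthetic document in the layout the regexes above match (real `.vert` files put one token per line with `<g/>` glue markers, which the replacement in `parse_single_doc` collapses), and the structure it returns, assuming ftfy leaves already-clean text unchanged:

```python
from tensorflow_datasets.text.brwac.brwac_utils import parse_single_doc

doc = (
    '<doc docid="42" title="Exemplo" uri="http://example.com">\n'
    '<p> <s> Primeira frase . </s> <s> Segunda frase . </s> </p>\n'
    '<p> <s> Outra frase . </s> </p>\n'
)
parsed = parse_single_doc(doc)
# parsed == {
#     'doc_id': '42',
#     'title': 'Exemplo',
#     'uri': 'http://example.com',
#     'text': {'paragraphs': [['Primeira frase .', 'Segunda frase .'],
#                             ['Outra frase .']]},
# }
```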