Added BrWac dataset #3880

Open · wants to merge 21 commits into `master`
1 change: 1 addition & 0 deletions setup.py
@@ -146,6 +146,7 @@
# In alphabetical order
'aflw2k3d': ['scipy'],
'beir': ['apache_beam'],
'brwac': ['ftfy'],
'ble_wind_field': ['gcsfs', 'zarr'],
'c4': ['apache_beam', 'gcld3', 'langdetect', 'nltk', 'tldextract'],
'cats_vs_dogs': ['matplotlib'],
3 changes: 3 additions & 0 deletions tensorflow_datasets/core/lazy_imports_lib.py
@@ -77,6 +77,9 @@ def envlogger(cls):

@utils.classproperty
@classmethod
def ftfy(cls):
return _try_import("ftfy")

  @utils.classproperty
  @classmethod
  def gcsfs_store(cls):
    return _try_import("gcsfs").GCSFileSystem(token='anon').get_mapper

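With the property in place, `ftfy` resolves like any other TFDS lazy import: the module is only imported on first attribute access. A minimal usage sketch (the mojibake string is just an illustration):

```python
import tensorflow_datasets as tfds

# ftfy is imported here, on first access, not at `import tensorflow_datasets` time.
ftfy = tfds.core.lazy_imports.ftfy
print(ftfy.fix_text('NaÃ¯ve text'))  # -> 'Naïve text'
```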
1 change: 1 addition & 0 deletions tensorflow_datasets/core/lazy_imports_lib_test.py
@@ -29,6 +29,7 @@ class LazyImportsTest(testing.TestCase, parameterized.TestCase):
@parameterized.parameters(
"bs4",
"cv2",
"ftfy",
"gcld3",
"gcsfs_store",
"langdetect",
1 change: 1 addition & 0 deletions tensorflow_datasets/text/__init__.py
@@ -21,6 +21,7 @@
from tensorflow_datasets.text.beir import Beir
from tensorflow_datasets.text.blimp import Blimp
from tensorflow_datasets.text.bool_q import BoolQ
from tensorflow_datasets.text.brwac import Brwac
from tensorflow_datasets.text.c4 import C4
from tensorflow_datasets.text.cfq import CFQ
from tensorflow_datasets.text.cfq import CFQConfig
3 changes: 3 additions & 0 deletions tensorflow_datasets/text/brwac/__init__.py
@@ -0,0 +1,3 @@
"""brwac dataset."""

from .brwac import Brwac
101 changes: 101 additions & 0 deletions tensorflow_datasets/text/brwac/brwac.py
@@ -0,0 +1,101 @@
# coding=utf-8
# Copyright 2021 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""brwac dataset."""

import tensorflow as tf
import tensorflow_datasets.public_api as tfds
from tensorflow_datasets.text.brwac.brwac_utils import parse_vert_file

_HOMEPAGE = 'https://www.inf.ufrgs.br/pln/wiki/index.php?title=BrWaC'


# pylint: disable=line-too-long
_DESCRIPTION = f"""\
The Brazilian Portuguese Web as Corpus (BrWaC) is a large corpus constructed
following the Wacky framework and made publicly available for research
purposes.

The current corpus version, released in January 2017, is composed of 3.53
million documents and 2.68 billion tokens. In order to use this dataset, you
must request access by filling out the form on the
[official homepage]({_HOMEPAGE}).

Please note that this resource is available solely for academic research
purposes, and you agree not to use it for any commercial application.

The title and text fields are preprocessed with the
[ftfy](https://github.com/rspeer/python-ftfy) Python library
([Speer, 2019](http://doi.org/10.5281/zenodo.2591652)).

Note: this description is adapted from the [official homepage]({_HOMEPAGE}).
"""

_CITATION = """
@inproceedings{wagner2018brwac,
title={The brWaC Corpus: A New Open Resource for Brazilian Portuguese},
author={{Wagner Filho}, Jorge A and Wilkens, Rodrigo and Idiart, Marco and Villavicencio, Aline},
booktitle={Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)},
year={2018}
}
"""


class Brwac(tfds.core.GeneratorBasedBuilder):
"""DatasetBuilder for brwac dataset."""

VERSION = tfds.core.Version('1.0.0')
RELEASE_NOTES = {
'1.0.0': 'Initial release.',
}
MANUAL_DOWNLOAD_INSTRUCTIONS = """\
Fill out the form at https://www.inf.ufrgs.br/pln/wiki/index.php?title=BrWaC
and wait for the email with download instructions. In the email, download
the file by clicking the link labeled `1) Basic VERT corpus
file (4.45GB as .7z)`. Extract this file and place its output
(`brwac.vert`, approx. 22GB in size) in `manual_dir/`.
"""

def _info(self) -> tfds.core.DatasetInfo:
"""Returns the dataset metadata."""
return tfds.core.DatasetInfo(
builder=self,
description=_DESCRIPTION,
features=tfds.features.FeaturesDict(
{
'doc_id': tfds.features.Text(),
'doc_idx': tf.int64,
'title': tfds.features.Text(),
'uri': tfds.features.Text(),
'text': tfds.features.Sequence({
'paragraphs': tfds.features.Sequence(tfds.features.Text())
}),
}
),
supervised_keys=None,
homepage=_HOMEPAGE,
citation=_CITATION,
)

def _split_generators(self, dl_manager: tfds.download.DownloadManager):
"""Returns SplitGenerators."""
path = dl_manager.manual_dir

return {
'train': self._generate_examples(path / 'brwac.vert'),
}

def _generate_examples(self, path):
# """Yields examples."""
for doc in parse_vert_file(path=path, show_progress=True):
yield doc['doc_idx'], {
k: doc[k] for k in ['doc_id', 'title', 'uri', 'text', 'doc_idx']
}
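For reference, a minimal sketch of how the builder would be used once `brwac.vert` is in place. `/path/to/manual_dir` is a placeholder; the rest is standard TFDS API:

```python
import tensorflow_datasets as tfds

builder = tfds.builder('brwac')
builder.download_and_prepare(
    download_config=tfds.download.DownloadConfig(
        manual_dir='/path/to/manual_dir'))  # directory containing brwac.vert
ds = builder.as_dataset(split='train')

for example in ds.take(1):
  print(example['title'])               # scalar tf.string
  print(example['text']['paragraphs'])  # nested sequence: paragraphs of sentences
```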
30 changes: 30 additions & 0 deletions tensorflow_datasets/text/brwac/brwac_test.py
@@ -0,0 +1,30 @@
# coding=utf-8
# Copyright 2021 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""brwac dataset."""

import tensorflow_datasets as tfds
from . import brwac


class BrwacTest(tfds.testing.DatasetBuilderTestCase):
"""Tests for brwac dataset."""
DATASET_CLASS = brwac.Brwac
SPLITS = {'train': 20}

DL_EXTRACT_RESULT = {'train': 'brwac.vert'}


if __name__ == '__main__':
tfds.testing.test_main()
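`DatasetBuilderTestCase` reads its input from the dataset's `dummy_data/` directory, so the fake `brwac.vert` placed there needs 20 documents to satisfy `SPLITS`. A hypothetical helper (not part of this PR) that would generate such a file, matching the `</doc>` end-of-document convention the parser relies on:

```python
def write_dummy_vert(path: str, n_docs: int = 20) -> None:
  """Writes a minimal fake BrWaC .vert file with `n_docs` documents."""
  with open(path, 'w', encoding='utf-8') as f:
    for i in range(n_docs):
      f.write(f'<doc docid="doc{i}" title="Título {i}" '
              f'uri="http://example.com/{i}">\n')
      f.write('<p> <s> Uma frase de exemplo . </s> </p>\n')
      f.write('</doc>\n')
```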
81 changes: 81 additions & 0 deletions tensorflow_datasets/text/brwac/brwac_utils.py
@@ -0,0 +1,81 @@
# coding=utf-8
# Copyright 2021 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities for the brwac dataset."""

import re
from typing import Dict

import tensorflow as tf
import tensorflow_datasets.public_api as tfds
from tqdm import tqdm


paragraph_pattern = re.compile('<p> (.*?) </p>', flags=re.DOTALL)
sentence_pattern = re.compile('<s> (.*?) </s>', flags=re.DOTALL)


def extract_ids(line, fix_title=True):
  """Extracts `doc_id`, `title` and `uri` from a document header line."""
  matches = re.findall('<doc docid="(.*?)" title="(.*?)" uri="(.*?)">', line)
  assert len(matches) == 1
  doc_id, title, uri = matches[0]
  if fix_title:
    # Resolve the lazy import at call time so that importing this module
    # does not require ftfy to be installed.
    title = tfds.core.lazy_imports.ftfy.fix_text(title)
  return dict(doc_id=doc_id, title=title, uri=uri)


def parse_single_doc(doc_string: str) -> Dict:
  """Parses a single BrWaC document."""
  ftfy = tfds.core.lazy_imports.ftfy  # imported lazily, on first use
  doc_header, doc_body = doc_string.split('\n', maxsplit=1)
  # Drop `<g/>` glue markers and collapse all whitespace to single spaces.
  doc_body = ' '.join(doc_body.replace('\n<g/>\n', '').split())
  paragraphs = [
      list(map(ftfy.fix_text, re.findall(sentence_pattern, sentences)))
      for sentences in re.findall(paragraph_pattern, doc_body)
  ]
  return_dict = extract_ids(doc_header)
  return_dict.update({'text': {'paragraphs': paragraphs}})
  return return_dict


def parse_vert_file(path: str, show_progress: bool = True):
"""Parses brwac vert file.

Args:
path: path for file
show_progress (bool): if we should show a progress bar

Yields:
dict with a BrWac document contents
"""
doc_buffer = ''
doc_count = 0
with tf.io.gfile.GFile(path, 'r') as fin:
pbar = tqdm(
fin,
desc=f'Parsing BrWac vert file {path}.',
disable=not show_progress,
unit=' lines processed'
)
for line in pbar:
doc_buffer += line
if line == '</doc>\n': # end of document
parsed_doc = parse_single_doc(doc_buffer)
parsed_doc['doc_idx'] = doc_count
yield parsed_doc
doc_buffer = ''
doc_count += 1
pbar.set_postfix(**{'Documents processed': doc_count})
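To make the expected input concrete, here is a synthetic document in the layout the regexes above match (real `.vert` files put one token per line with `<g/>` glue markers, which the replacement in `parse_single_doc` collapses), and the structure it returns, assuming ftfy leaves already-clean text unchanged:

```python
from tensorflow_datasets.text.brwac.brwac_utils import parse_single_doc

doc = (
    '<doc docid="42" title="Exemplo" uri="http://example.com">\n'
    '<p> <s> Primeira frase . </s> <s> Segunda frase . </s> </p>\n'
    '<p> <s> Outra frase . </s> </p>\n'
)
parsed = parse_single_doc(doc)
# parsed == {
#     'doc_id': '42',
#     'title': 'Exemplo',
#     'uri': 'http://example.com',
#     'text': {'paragraphs': [['Primeira frase .', 'Segunda frase .'],
#                             ['Outra frase .']]},
# }
```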