tensorflow · acharles7 · May 12, 2020 · May 13, 2020 · May 13, 2020 · Jun 10, 2020
diff --git a/tensorflow_datasets/structured/__init__.py b/tensorflow_datasets/structured/__init__.py
@@ -19,6 +19,7 @@
 from tensorflow_datasets.structured.amazon_us_reviews import AmazonUSReviews
 from tensorflow_datasets.structured.forest_fires import ForestFires
 from tensorflow_datasets.structured.german_credit_numeric import GermanCreditNumeric
+from tensorflow_datasets.structured.heart_disease import HeartDisease
 from tensorflow_datasets.structured.higgs import Higgs
 from tensorflow_datasets.structured.iris import Iris
 from tensorflow_datasets.structured.rock_you import RockYou

diff --git a/tensorflow_datasets/structured/heart_disease.py b/tensorflow_datasets/structured/heart_disease.py
@@ -0,0 +1,85 @@
+"""heart_disease dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.compat.v2 as tf
+import tensorflow_datasets.public_api as tfds
+
+# BibTeX citation and description passed to tfds.core.DatasetInfo by HeartDisease._info.
+_CITATION = """\
+@misc{Dua:2019 ,
+author = "Janosi, Steinbrunn and Pfisterer, Detrano",
+year = "1988",
+title = "{UCI} Machine Learning Repository",
+url = "http://archive.ics.uci.edu/ml/datasets/Heart+Disease",
+institution = "University of California, Irvine, School of Information and Computer Sciences"
+}
+"""
+
+_DESCRIPTION = """\
+This data set contains 13 attributes and labels of heart disease from 303 participants \
+from Cleveland since Cleveland data was most commonly used in modern research.
+
+Attribute by column index
+1. age      : age in years
+2. sex      : sex (1 = male; 0 = female)
+3. cp       : chest pain type
+    (1 = typical angina; 2 = atypical angina; 3 = non-anginal pain; 4 = asymptomatic)
+4. trestbps : resting blood pressure (in mm Hg on admission to the hospital)
+5. chol     : serum cholestoral in mg/dl
+6. fbs      : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
+7. restecg  : resting electrocardiographic results
+8. thalach  : maximum heart rate achieved
+9. exang    : exercise induced angina (1 = yes; 0 = no)
+10. oldpeak : ST depression induced by exercise relative to rest
+11. slope   : the slope of the peak exercise ST segment (1 = upsloping; 2 = flat; 3 = downsloping)
+12. ca      : number of major vessels (0-3) colored by flourosopy
+13. thal    : 3 = normal; 6 = fixed defect; 7 = reversable defect
+14. num (the predicted attribute): diagnosis of heart disease (angiographic disease status)
+    (0 = < 50% diameter narrowing, no presence of heart disease;
+     1 = > 50% diameter narrowing, with increasing severity)
+Dataset Homepage: http://archive.ics.uci.edu/ml/datasets/Heart+Disease
+"""
+
+class HeartDisease(tfds.core.GeneratorBasedBuilder):
+  """Cleveland heart disease dataset: 13 numeric attributes plus a 0-4 label."""
+
+  VERSION = tfds.core.Version("0.0.1", "New split API (https://tensorflow.org/datasets/splits)")
+
+  def _info(self):
+    """Returns the DatasetInfo: a float32 vector of 13 features and a 5-class label."""
+    return tfds.core.DatasetInfo(
+        builder=self,
+        description=_DESCRIPTION,
+        features=tfds.features.FeaturesDict({
+            "features": tfds.features.Tensor(shape=(13,), dtype=tf.float32),
+            "label": tfds.features.ClassLabel(names=['0', '1', '2', '3', '4'])
+        }),
+        supervised_keys=("features", "label"),
+        homepage='http://archive.ics.uci.edu/ml/datasets/Heart+Disease',
+        citation=_CITATION,
+    )
+
+  def _split_generators(self, dl_manager):
+    """Downloads the data file and returns one TRAIN split holding its usable rows."""
+    filepath = dl_manager.download('http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data')
+    # Read inside `with` so the file handle is closed instead of leaked.
+    with tf.io.gfile.GFile(filepath) as data_file:
+      all_lines = data_file.read().splitlines()  # also copes with "\r\n" endings
+    # Skip blank lines and rows containing '?' (presumably the missing-value marker).
+    records = [l for l in all_lines if l.strip() and '?' not in l]
+    # There is no predefined train/val/test split for this dataset.
+    return [
+        tfds.core.SplitGenerator(name=tfds.Split.TRAIN, gen_kwargs={"records": records}),
+    ]
+
+  def _generate_examples(self, records):
+    """Yields (index, example) pairs; `records` are comma-separated rows, label last."""
+    for i, row in enumerate(records):
+      values = [value.strip() for value in row.split(',')]
+      yield i, {
+          "features": [float(value) for value in values[:-1]],
+          "label": values[-1]
+      }
diff --git a/tensorflow_datasets/structured/heart_disease_test.py b/tensorflow_datasets/structured/heart_disease_test.py
@@ -0,0 +1,19 @@
+"""heart_disease dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow_datasets.public_api as tfds
+from tensorflow_datasets.structured import heart_disease
+
+class HeartDiseaseTest(tfds.testing.DatasetBuilderTestCase):
+  """Builder test for HeartDisease, run against the fake example file."""
+  # Name of the fake data file (see the fake_examples/heart_disease directory).
+  DL_EXTRACT_RESULT = 'processed.cleveland.data'
+  DATASET_CLASS = heart_disease.HeartDisease
+  # Expected number of fake examples per split.
+  SPLITS = {"train": 1}
+
+if __name__ == "__main__":
+  tfds.testing.test_main()
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/heart_disease/processed.cleveland.data b/tensorflow_datasets/testing/test_data/fake_examples/heart_disease/processed.cleveland.data
@@ -0,0 +1 @@
+63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
diff --git a/tensorflow_datasets/url_checksums/heart_disease.txt b/tensorflow_datasets/url_checksums/heart_disease.txt
@@ -0,0 +1 @@
+http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data 18461 a74b7efa387bc9d108d7d0115d831fe9b414b29ae7124f331b622b4efa0427c8
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data 18461 a74b7efa387bc9d108d7d0115d831fe9b414b29ae7124f331b622b4efa0427c8