feat: Create a backend to transform PubMed XML files to DoclingDocument (#557)

Signed-off-by: lucas-morin <lucas.morin222@gmail.com>
This commit is contained in:
Lucas Morin 2024-12-17 19:27:09 +01:00 committed by GitHub
parent e31f09f71f
commit fd034802b6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
24 changed files with 31040 additions and 4 deletions

View File

@ -0,0 +1,592 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Any, Set, Union
import lxml
from bs4 import BeautifulSoup
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
TableCell,
TableData,
)
from lxml import etree
from typing_extensions import TypedDict, override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class Paragraph(TypedDict):
    """A body paragraph together with the titles of its enclosing sections."""

    # Paragraph text with newline characters removed.
    text: str
    # Titles of the enclosing sections, innermost section first.
    headers: list[str]
class Author(TypedDict):
    """An article author and the affiliations referenced by that author."""

    # Author name built from the surname and given-names elements.
    name: str
    # Text of each affiliation (<aff>) the author cross-references.
    affiliation_names: list[str]
class Table(TypedDict):
    """A table extracted from a <table-wrap> element of the article body."""

    # Text of the <label> element (empty string when absent).
    label: str
    # Text of the caption paragraph or caption title (empty string when absent).
    caption: str
    # Serialized <table> markup, decoded as UTF-8 (empty string when absent).
    content: str
class FigureCaption(TypedDict):
    """Label and caption text of a <fig> element."""

    # Text of the figure's <label> element (empty string when absent).
    label: str
    # Text of each child of <caption>, each followed by a newline.
    caption: str
class Reference(TypedDict):
    """A bibliographic reference extracted from the reference list."""

    # Author names joined with "; ".
    author_names: str
    # Text of the <article-title> element.
    title: str
    # Text of the <source> element.
    journal: str
    # Text of the <year> element.
    year: str
class XMLComponents(TypedDict):
    """All components parsed from the XML tree before the document is populated."""

    # Article title.
    title: str
    # Authors with their affiliations.
    authors: list[Author]
    # Abstract text.
    abstract: str
    # Body paragraphs with their section headers.
    paragraphs: list[Paragraph]
    # Tables found in the article body.
    tables: list[Table]
    # Labels and captions of the figures.
    figure_captions: list[FigureCaption]
    # Entries of the reference list.
    references: list[Reference]
class PubMedDocumentBackend(DeclarativeDocumentBackend):
"""
The code from this document backend has been developed by modifying parts of the PubMed Parser library (version 0.5.0, released on 12.08.2024):
Achakulvisut et al., (2020).
Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset XML Dataset.
Journal of Open Source Software, 5(46), 1979,
https://doi.org/10.21105/joss.01979
"""
@override
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
    """Parse the XML input and record whether it declares a JATS DTD.

    Args:
        in_doc: Input document descriptor, forwarded to the base class.
        path_or_stream: Path or in-memory stream holding the XML content.

    Raises:
        RuntimeError: If the input cannot be parsed.
    """
    super().__init__(in_doc, path_or_stream)
    self.path_or_stream = path_or_stream

    # Initialize parents for the document hierarchy
    self.parents: dict = {}

    self.valid = False
    try:
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.seek(0)
        self.tree: lxml.etree._ElementTree = etree.parse(self.path_or_stream)
        # public_id is None when the document has no DOCTYPE public
        # identifier; treat that as "not JATS" instead of letting the
        # membership test raise a TypeError.
        public_id = self.tree.docinfo.public_id or ""
        if "/NLM//DTD JATS" in public_id:
            self.valid = True
    except Exception as exc:
        raise RuntimeError(
            f"Could not initialize PubMed backend for file with hash {self.document_hash}."
        ) from exc
@override
def is_valid(self) -> bool:
    """Return the validity flag computed when the backend was initialized."""
    return self.valid
@classmethod
@override
def supports_pagination(cls) -> bool:
    """Return False: this backend does not support pagination."""
    return False
@override
def unload(self) -> None:
    """Close the input if it is an in-memory stream, then drop the reference to it."""
    if isinstance(self.path_or_stream, BytesIO):
        self.path_or_stream.close()
    self.path_or_stream = None
@classmethod
@override
def supported_formats(cls) -> Set[InputFormat]:
    """Return the set of input formats handled by this backend (PubMed XML only)."""
    return {InputFormat.XML_PUBMED}
@override
def convert(self) -> DoclingDocument:
    """Build a DoclingDocument from the parsed XML tree.

    Returns:
        The document populated with the components extracted from the XML.
    """
    # Start from an empty document that records where the content came from
    document = DoclingDocument(
        name=self.file.stem or "file",
        origin=DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="application/xml",
            binary_hash=self.document_hash,
        ),
    )
    _log.debug("Trying to convert PubMed XML document...")

    # Extract the XML components, then attach them to the document
    components: XMLComponents = self._parse()
    return self._populate_document(document, components)
def _parse_title(self) -> str:
    """Return the article title, or an empty string when none is present.

    The text fragments of the first ``article-title`` element are stripped
    of newline characters and joined with single spaces.
    """
    title_nodes = self.tree.xpath(".//title-group/article-title")
    if not title_nodes:
        # Articles without a title element previously raised IndexError.
        return ""
    return " ".join(t.replace("\n", "") for t in title_nodes[0].itertext())
def _parse_authors(self) -> list[Author]:
    """Return the authors of the article with their affiliation texts.

    Contributors that carry neither a surname nor given names (for example
    group authors without a ``name`` element) are skipped instead of
    aborting the whole conversion.
    """
    # Get mapping between affiliation ids and names
    affiliation_names_by_id = {
        aff_node.attrib["id"]: ": ".join(
            t for t in aff_node.itertext() if t != "\n"
        )
        for aff_node in self.tree.xpath(".//aff[@id]")
    }

    # Get author names and affiliation names
    authors: list[Author] = []
    for author_node in self.tree.xpath(
        './/contrib-group/contrib[@contrib-type="author"]'
    ):
        # Name: surname first, then given names; either part may be missing
        name_parts = []
        for tag in ("surname", "given-names"):
            part_nodes = author_node.xpath(f"name/{tag}")
            if part_nodes and part_nodes[0].text:
                name_parts.append(part_nodes[0].text)
        if not name_parts:
            _log.debug(f"Skipping author without a name for: {str(self.file)}")
            continue

        # Affiliation names: keep only cross-references that resolve
        affiliation_names = []
        for xref_node in author_node.xpath('xref[@ref-type="aff"]'):
            affiliation_id = xref_node.attrib.get("rid")
            if affiliation_id in affiliation_names_by_id:
                affiliation_names.append(affiliation_names_by_id[affiliation_id])

        authors.append(
            {
                "name": " ".join(name_parts),
                "affiliation_names": affiliation_names,
            }
        )

    return authors
def _parse_abstract(self) -> str:
    """Return the text of every ``abstract`` element, concatenated.

    Newline characters are removed from each text fragment before joining.
    """
    return "".join(
        fragment.replace("\n", "")
        for abstract_node in self.tree.xpath(".//abstract")
        for fragment in abstract_node.itertext()
    )
def _parse_main_text(self) -> list[Paragraph]:
    """Collect the body paragraphs and the titles of their enclosing sections.

    Paragraphs located inside a caption are ignored. Section titles are
    gathered from the nearest ancestor outwards and collection stops at the
    first ancestor that has no ``title`` child.
    """

    def _flatten(node) -> str:
        # Concatenate all text below the node, dropping newline characters
        return "".join(chunk.replace("\n", "") for chunk in node.itertext())

    collected: list[Paragraph] = []
    for p_node in self.tree.xpath("//body//p"):
        # Skip captions
        if "/caption" in p_node.getroottree().getpath(p_node):
            continue

        # Walk up one level at a time while the ancestor carries a title
        section_titles: list[str] = []
        title_path = "../title"
        title_nodes = p_node.xpath(title_path)
        while title_nodes:
            section_titles.append(_flatten(title_nodes[0]))
            title_path = "../" + title_path
            title_nodes = p_node.xpath(title_path)

        collected.append({"text": _flatten(p_node), "headers": section_titles})

    return collected
def _parse_tables(self) -> list[Table]:
    """Extract label, caption and markup of every ``table-wrap`` in the body.

    Missing parts are left as empty strings so that downstream string
    concatenation never receives ``None``.
    """
    tables: list[Table] = []
    for table_node in self.tree.xpath(".//body//table-wrap"):
        table: Table = {"label": "", "caption": "", "content": ""}

        # Content: a direct <table>, otherwise the one inside <alternatives>
        content_nodes = table_node.xpath("table") or table_node.xpath(
            "alternatives/table"
        )
        if content_nodes:
            table["content"] = etree.tostring(content_nodes[0]).decode("utf-8")

        # Caption: first caption paragraph, otherwise the caption title
        caption_nodes = table_node.xpath("caption/p") or table_node.xpath(
            "caption/title"
        )
        if caption_nodes:
            table["caption"] = "".join(
                t.replace("\n", "") for t in caption_nodes[0].itertext()
            )

        # Label: gather all nested text, as done for figure labels. Reading
        # only `.text` yields None when the label wraps inline markup.
        label_nodes = table_node.xpath("label")
        if label_nodes:
            table["label"] = "".join(
                t.replace("\n", "") for t in label_nodes[0].itertext()
            )

        tables.append(table)

    return tables
def _parse_figure_captions(self) -> list[FigureCaption]:
    """Return the label and caption text of every ``fig`` element.

    The caption is built from the children of the first ``caption``
    element; the text of each child is followed by a newline.
    """

    def _flat_text(node) -> str:
        # All text below the node with newline characters removed
        return "".join(piece.replace("\n", "") for piece in node.itertext())

    results: list[FigureCaption] = []
    for fig_node in self.tree.xpath(".//fig"):
        entry: FigureCaption = {
            "caption": "",
            "label": "",
        }

        # Label
        label_nodes = fig_node.xpath("label")
        if label_nodes:
            entry["label"] = _flat_text(label_nodes[0])

        # Caption
        caption_nodes = fig_node.xpath("caption")
        if caption_nodes:
            entry["caption"] = "".join(
                _flat_text(child) + "\n" for child in caption_nodes[0]
            )

        results.append(entry)

    return results
def _parse_references(self) -> list[Reference]:
    """Extract authors, title, journal and year of each reference.

    Only the first ``mixed-citation``, ``element-citation`` or ``citation``
    child of a ``ref`` is considered, and only when it declares a
    ``citation-type`` or ``publication-type`` attribute.
    """
    references: list[Reference] = []
    for reference_node_abs in self.tree.xpath(".//ref-list/ref"):
        reference_node: Any = None
        for tag in ("mixed-citation", "element-citation", "citation"):
            candidates = reference_node_abs.xpath(tag)
            if candidates:
                reference_node = candidates[0]
                break

        if reference_node is None:
            continue

        if not any(
            attribute in ("citation-type", "publication-type")
            for attribute in reference_node.attrib.keys()
        ):
            continue

        reference: Reference = {
            "author_names": "",
            "title": "",
            "journal": "",
            "year": "",
        }

        # Author names
        names = []
        direct_name_nodes = reference_node.xpath("name")
        if direct_name_nodes:
            for name_node in direct_name_nodes:
                names.append(
                    " ".join(
                        child.text for child in name_node if child.text is not None
                    )
                )
        else:
            # Only <name> children carry given-names/surname. Siblings such
            # as <etal/> or <collab> previously raised IndexError here.
            for name_node in reference_node.xpath("person-group[1]/name"):
                name_parts = []
                for tag in ("given-names", "surname"):
                    part_nodes = name_node.xpath(tag)
                    if part_nodes and part_nodes[0].text:
                        name_parts.append(part_nodes[0].text)
                if name_parts:
                    names.append(" ".join(name_parts))
        reference["author_names"] = "; ".join(names)

        # Title
        title_nodes = reference_node.xpath("article-title")
        if title_nodes:
            reference["title"] = " ".join(
                t.replace("\n", " ") for t in title_nodes[0].itertext()
            )

        # Journal
        source_nodes = reference_node.xpath("source")
        if source_nodes:
            reference["journal"] = source_nodes[0].text or ""

        # Year
        year_nodes = reference_node.xpath("year")
        if year_nodes:
            reference["year"] = year_nodes[0].text or ""

        # Unstructured citation: fall back to its raw leading text. The
        # journal is read from <source>, so that is the tag to test here
        # (the former check looked for a <journal> tag instead).
        if not title_nodes and not source_nodes and not year_nodes:
            reference["title"] = reference_node.text or ""

        references.append(reference)

    return references
def _parse(self) -> XMLComponents:
    """Run every component parser and bundle the results."""
    title = self._parse_title()
    authors = self._parse_authors()
    abstract = self._parse_abstract()
    paragraphs = self._parse_main_text()
    tables = self._parse_tables()
    figure_captions = self._parse_figure_captions()
    references = self._parse_references()

    components: XMLComponents = {
        "title": title,
        "authors": authors,
        "abstract": abstract,
        "paragraphs": paragraphs,
        "tables": tables,
        "figure_captions": figure_captions,
        "references": references,
    }
    return components
def _populate_document(
    self, doc: DoclingDocument, xml_components: XMLComponents
) -> DoclingDocument:
    """Attach the parsed components to ``doc`` and return it.

    Title, authors, abstract, main text and references are always added;
    the tables and figures sections only when such components exist.
    """
    for add_component in (
        self._add_title,
        self._add_authors,
        self._add_abstract,
        self._add_main_text,
    ):
        add_component(doc, xml_components)

    if xml_components["tables"]:
        self._add_tables(doc, xml_components)

    if xml_components["figure_captions"]:
        self._add_figure_captions(doc, xml_components)

    self._add_references(doc, xml_components)
    return doc
def _add_figure_captions(
    self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
    """Add a "Figures" heading and one captioned picture per figure."""
    figures_heading = doc.add_heading(parent=self.parents["Title"], text="Figures")
    self.parents["Figures"] = figures_heading

    for figure in xml_components["figure_captions"]:
        # "<label>: <caption>" with surrounding whitespace of the caption removed
        caption_text = ": ".join((figure["label"], figure["caption"].strip()))
        caption_item = doc.add_text(label=DocItemLabel.CAPTION, text=caption_text)
        doc.add_picture(
            parent=self.parents["Figures"],
            caption=caption_item,
        )
def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
    """Add the article title and register it as the "Title" parent item."""
    title_item = doc.add_text(
        parent=None,
        text=xml_components["title"],
        label=DocItemLabel.TITLE,
    )
    self.parents["Title"] = title_item
def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
    """Add one paragraph listing every author followed by their affiliations."""
    segments: list = []
    for author in xml_components["authors"]:
        # Each author contributes two segments: the name, then the affiliations
        segments.extend((author["name"], ", ".join(author["affiliation_names"])))

    doc.add_text(
        parent=self.parents["Title"],
        text="; ".join(segments),
        label=DocItemLabel.PARAGRAPH,
    )
def _add_abstract(
    self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
    """Add an "Abstract" heading under the title and the abstract text below it."""
    abstract_heading = doc.add_heading(
        parent=self.parents["Title"], text="Abstract"
    )
    self.parents["Abstract"] = abstract_heading
    doc.add_text(
        parent=abstract_heading,
        text=xml_components["abstract"],
        label=DocItemLabel.TEXT,
    )
def _add_main_text(
    self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
    """Add section headings and body paragraphs to the document.

    Paragraphs without any section header are not added. A heading is
    created the first time its text is encountered and is attached to the
    heading of the enclosing section when that one is known, otherwise to
    the title.
    """
    seen_headers: set = set()
    for paragraph in xml_components["paragraphs"]:
        headers = paragraph["headers"]
        if not headers:
            continue

        # Headers are stored innermost first; create headings outermost first
        outermost_first = headers[::-1]
        for depth, header in enumerate(outermost_first):
            if header in seen_headers:
                continue
            seen_headers.add(header)

            if depth > 0 and outermost_first[depth - 1] in self.parents:
                heading_parent = self.parents[outermost_first[depth - 1]]
            else:
                heading_parent = self.parents["Title"]
            self.parents[header] = doc.add_heading(parent=heading_parent, text=header)

        # Paragraph text goes under its innermost section heading
        innermost = headers[0]
        if innermost in self.parents:
            text_parent = self.parents[innermost]
        else:
            text_parent = self.parents["Title"]
        doc.add_text(
            parent=text_parent, label=DocItemLabel.TEXT, text=paragraph["text"]
        )
def _add_references(
    self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
    """Add a "References" heading with one list item per non-empty reference.

    Each item is rendered as "<authors>. <title>. <journal> (<year>)",
    leaving out the parts that are empty.
    """
    references_heading = doc.add_heading(
        parent=self.parents["Title"], text="References"
    )
    self.parents["References"] = references_heading
    reference_list = doc.add_group(
        parent=self.parents["References"], label=GroupLabel.LIST, name="list"
    )

    for reference in xml_components["references"]:
        pieces: list = []
        if reference["author_names"]:
            pieces.append(reference["author_names"] + ". ")
        if reference["title"]:
            pieces.append(reference["title"])
            # Terminate the title with a period unless it already has one
            if not reference["title"].endswith("."):
                pieces.append(".")
            pieces.append(" ")
        if reference["journal"]:
            pieces.append(reference["journal"])
        if reference["year"]:
            pieces.append(" (" + reference["year"] + ")")

        reference_text = "".join(pieces)
        if not reference_text:
            _log.debug(f"Skipping reference for: {str(self.file)}")
            continue

        doc.add_list_item(
            text=reference_text, enumerated=False, parent=reference_list
        )
def _add_tables(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
    """Add a "Tables" heading and every table that can be converted.

    Conversion is best effort: a table that fails to convert is skipped so
    the rest of the document is still produced.
    """
    self.parents["Tables"] = doc.add_heading(
        parent=self.parents["Title"], text="Tables"
    )
    for table_xml_component in xml_components["tables"]:
        try:
            self._add_table(doc, table_xml_component)
        except Exception:
            # Keep going, but record the traceback so that a dropped table
            # can be diagnosed from the debug log.
            _log.debug(
                f"Skipping unsupported table for: {str(self.file)}", exc_info=True
            )
def _add_table(self, doc: DoclingDocument, table_xml_component: Table) -> None:
    """Convert one table's markup into table data and add it to the document.

    Tables without ``<table>`` markup and tables containing a nested table
    are skipped. Spans that reach outside the computed grid raise
    ``IndexError``, which the caller handles by skipping the table.

    Args:
        doc: Document receiving the table and its caption.
        table_xml_component: Label, caption and serialized markup of the table.
    """
    soup = BeautifulSoup(table_xml_component["content"], "html.parser")
    table_tag = soup.find("table")
    if table_tag is None:
        # e.g. a table-wrap that holds only an image of the table
        _log.debug(f"Skipping table without content for: {str(self.file)}")
        return

    if table_tag.find("table") is not None:
        _log.debug(f"Skipping nested table for: {str(self.file)}")
        return

    rows = table_tag.find_all("tr")

    # Count the number of rows (number of <tr> elements)
    num_rows = len(rows)

    # Find the number of columns (taking into account colspan)
    num_cols = max(
        (
            sum(int(cell.get("colspan", 1)) for cell in row.find_all(["td", "th"]))
            for row in rows
        ),
        default=0,
    )

    # Occupancy grid: a slot is None until a cell (or one of its spans) covers it
    grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]

    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

    for row_idx, row in enumerate(rows):
        # For each row, find all the column cells (both <td> and <th>)
        cells = row.find_all(["td", "th"])

        # A row made only of <th> cells is a column-header row
        col_header = all(html_cell.name != "td" for html_cell in cells)

        col_idx = 0
        for html_cell in cells:
            text = html_cell.text

            col_span = int(html_cell.get("colspan", 1))
            row_span = int(html_cell.get("rowspan", 1))

            # Skip slots already covered by spans of earlier cells
            while grid[row_idx][col_idx] is not None:
                col_idx += 1
            for r in range(row_span):
                for c in range(col_span):
                    grid[row_idx + r][col_idx + c] = text

            cell = TableCell(
                text=text,
                row_span=row_span,
                col_span=col_span,
                start_row_offset_idx=row_idx,
                end_row_offset_idx=row_idx + row_span,
                start_col_offset_idx=col_idx,
                end_col_offset_idx=col_idx + col_span,
                col_header=col_header,
                row_header=((not col_header) and html_cell.name == "th"),
            )
            data.table_cells.append(cell)

    # The label may be None when it could not be read as plain text
    label = table_xml_component["label"] or ""
    table_caption = doc.add_text(
        label=DocItemLabel.CAPTION,
        text=label + ": " + table_xml_component["caption"],
    )
    doc.add_table(data=data, parent=self.parents["Tables"], caption=table_caption)

View File

@ -33,6 +33,7 @@ class InputFormat(str, Enum):
DOCX = "docx" DOCX = "docx"
PPTX = "pptx" PPTX = "pptx"
HTML = "html" HTML = "html"
XML_PUBMED = "xml_pubmed"
IMAGE = "image" IMAGE = "image"
PDF = "pdf" PDF = "pdf"
ASCIIDOC = "asciidoc" ASCIIDOC = "asciidoc"
@ -55,6 +56,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.PDF: ["pdf"], InputFormat.PDF: ["pdf"],
InputFormat.MD: ["md"], InputFormat.MD: ["md"],
InputFormat.HTML: ["html", "htm", "xhtml"], InputFormat.HTML: ["html", "htm", "xhtml"],
InputFormat.XML_PUBMED: ["xml", "nxml"],
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"], InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"], InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.XLSX: ["xlsx"], InputFormat.XLSX: ["xlsx"],
@ -72,6 +74,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
"application/vnd.openxmlformats-officedocument.presentationml.presentation", "application/vnd.openxmlformats-officedocument.presentationml.presentation",
], ],
InputFormat.HTML: ["text/html", "application/xhtml+xml"], InputFormat.HTML: ["text/html", "application/xhtml+xml"],
InputFormat.XML_PUBMED: ["application/xml"],
InputFormat.IMAGE: [ InputFormat.IMAGE: [
"image/png", "image/png",
"image/jpeg", "image/jpeg",

View File

@ -292,8 +292,7 @@ class _DocumentConversionInput(BaseModel):
mime = mime or "text/plain" mime = mime or "text/plain"
formats = MimeTypeToFormat.get(mime, []) formats = MimeTypeToFormat.get(mime, [])
if formats: if formats:
# TODO: remove application/xml case after adding another XML parse if len(formats) == 1 and mime not in ("text/plain"):
if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
return formats[0] return formats[0]
else: # ambiguity in formats else: # ambiguity in formats
return _DocumentConversionInput._guess_from_content( return _DocumentConversionInput._guess_from_content(
@ -325,6 +324,12 @@ class _DocumentConversionInput(BaseModel):
): ):
input_format = InputFormat.XML_USPTO input_format = InputFormat.XML_USPTO
if (
InputFormat.XML_PUBMED in formats
and "/NLM//DTD JATS" in xml_doctype
):
input_format = InputFormat.XML_PUBMED
elif mime == "text/plain": elif mime == "text/plain":
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"): if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
input_format = InputFormat.XML_USPTO input_format = InputFormat.XML_USPTO
@ -340,7 +345,6 @@ class _DocumentConversionInput(BaseModel):
mime = FormatToMimeType[InputFormat.HTML][0] mime = FormatToMimeType[InputFormat.HTML][0]
elif ext in FormatToExtensions[InputFormat.MD]: elif ext in FormatToExtensions[InputFormat.MD]:
mime = FormatToMimeType[InputFormat.MD][0] mime = FormatToMimeType[InputFormat.MD][0]
return mime return mime
@staticmethod @staticmethod
@ -370,4 +374,10 @@ class _DocumentConversionInput(BaseModel):
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str): if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
return "text/html" return "text/html"
p = re.compile(
r"<!doctype\s+(?P<root>[a-zA-Z_:][a-zA-Z0-9_:.-]*)\s+.*>\s*<(?P=root)\b"
)
if p.search(content_str):
return "application/xml"
return None return None

View File

@ -15,6 +15,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
from docling.datamodel.base_models import ( from docling.datamodel.base_models import (
ConversionStatus, ConversionStatus,
@ -88,6 +89,11 @@ class PatentUsptoFormatOption(FormatOption):
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
class XMLPubMedFormatOption(FormatOption):
    """Format option that defaults to ``SimplePipeline`` with ``PubMedDocumentBackend``."""

    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
class ImageFormatOption(FormatOption): class ImageFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
@ -121,6 +127,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.XML_USPTO: FormatOption( InputFormat.XML_USPTO: FormatOption(
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
), ),
InputFormat.XML_PUBMED: FormatOption(
pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
),
InputFormat.IMAGE: FormatOption( InputFormat.IMAGE: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
), ),
@ -171,7 +180,6 @@ class DocumentConverter:
max_num_pages: int = sys.maxsize, max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize, max_file_size: int = sys.maxsize,
) -> ConversionResult: ) -> ConversionResult:
all_res = self.convert_all( all_res = self.convert_all(
source=[source], source=[source],
raises_on_error=raises_on_error, raises_on_error=raises_on_error,

View File

@ -0,0 +1,165 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: KRAB-zinc finger protein gene ex ... retrotransposons in the murine lineage
item-2 at level 2: paragraph: Wolf Gernot; 1: The Eunice Kenne ... tes of Health: Bethesda: United States
item-3 at level 2: section_header: Abstract
item-4 at level 3: text: The Krüppel-associated box zinc ... edundant role restricting TE activity.
item-5 at level 2: section_header: Introduction
item-6 at level 3: text: Nearly half of the human and mou ... s are active beyond early development.
item-7 at level 3: text: TEs, especially long terminal re ... f evolutionarily young KRAB-ZFP genes.
item-8 at level 2: section_header: Results
item-9 at level 3: section_header: Mouse KRAB-ZFPs target retrotransposons
item-10 at level 4: text: We analyzed the RNA expression p ... duplications (Kauzlaric et al., 2017).
item-11 at level 4: text: To determine the binding sites o ... ctive in the early embryo (Figure 1A).
item-12 at level 4: text: We generally observed that KRAB- ... responsible for this silencing effect.
item-13 at level 4: text: To further test the hypothesis t ... t easily evade repression by mutation.
item-14 at level 4: text: Our KRAB-ZFP ChIP-seq dataset al ... ntirely shift the mode of DNA binding.
item-15 at level 3: section_header: Genetic deletion of KRAB-ZFP gen ... leads to retrotransposon reactivation
item-16 at level 4: text: The majority of KRAB-ZFP genes a ... ung et al., 2014; Deniz et al., 2018).
item-17 at level 3: section_header: KRAB-ZFP cluster deletions license TE-borne enhancers
item-18 at level 4: text: We next used our RNA-seq dataset ... vating effects of TEs on nearby genes.
item-19 at level 4: text: While we generally observed that ... he internal region and not on the LTR.
item-20 at level 3: section_header: ETn retrotransposition in Chr4-cl KO and WT mice
item-21 at level 4: text: IAP, ETn/ETnERV and MuLV/RLTR4 r ... s may contribute to reduced viability.
item-22 at level 4: text: We reasoned that retrotransposon ... Tn insertions at a high recovery rate.
item-23 at level 4: text: Using this dataset, we first con ... nsertions in our pedigree (Figure 4A).
item-24 at level 4: text: To validate some of the novel ET ... ess might have truncated this element.
item-25 at level 4: text: Besides novel ETn insertions tha ... tions (Figure 4—figure supplement 3D).
item-26 at level 4: text: Finally, we asked whether there ... s clearly also play an important role.
item-27 at level 2: section_header: Discussion
item-28 at level 3: text: C2H2 zinc finger proteins, about ... ) depending upon their insertion site.
item-29 at level 3: text: Despite a lack of widespread ETn ... ion of the majority of KRAB-ZFP genes.
item-30 at level 2: section_header: Materials and methods
item-31 at level 3: section_header: Cell lines and transgenic mice
item-32 at level 4: text: Mouse ES cells and F9 EC cells w ... KO/KO and KO/WT (B6/129 F2) offspring.
item-33 at level 3: section_header: Generation of KRAB-ZFP expressing cell lines
item-34 at level 4: text: KRAB-ZFP ORFs were PCR-amplified ... led and further expanded for ChIP-seq.
item-35 at level 3: section_header: CRISPR/Cas9 mediated deletion of KRAB-ZFP clusters and an MMETn insertion
item-36 at level 4: text: All gRNAs were expressed from th ... PCR genotyping (Supplementary file 3).
item-37 at level 3: section_header: ChIP-seq analysis
item-38 at level 4: text: For ChIP-seq analysis of KRAB-ZF ... 010 or Khil et al., 2012 respectively.
item-39 at level 4: text: ChIP-seq libraries were construc ... were re-mapped using Bowtie (--best).
item-40 at level 3: section_header: Luciferase reporter assays
item-41 at level 4: text: For KRAB-ZFP repression assays, ... after transfection as described above.
item-42 at level 3: section_header: RNA-seq analysis
item-43 at level 4: text: Whole RNA was purified using RNe ... lemented in the R function p.adjust().
item-44 at level 3: section_header: Reduced representation bisulfite sequencing (RRBS-seq)
item-45 at level 4: text: For RRBS-seq analysis, Chr4-cl W ... h sample were considered for analysis.
item-46 at level 3: section_header: Retrotransposition assay
item-47 at level 4: text: The retrotransposition vectors p ... were stained with Amido Black (Sigma).
item-48 at level 3: section_header: Capture-seq screen
item-49 at level 4: text: To identify novel retrotransposo ... assembly using the Unicycler software.
item-50 at level 2: section_header: Tables
item-51 at level 3: table with [9x5]
item-51 at level 4: caption: Table 1.: * Number of protein-coding KRAB-ZFP genes identified in a previously published screen (Imbeault et al., 2017) and the ChIP-seq data column indicates the number of KRAB-ZFPs for which ChIP-seq was performed in this study.
item-52 at level 3: table with [31x5]
item-52 at level 4: caption: Key resources table:
item-53 at level 2: section_header: Figures
item-54 at level 3: picture
item-54 at level 4: caption: Figure 1.: Genome-wide binding patterns of mouse KRAB-ZFPs.
(A) Probability heatmap of KRAB-ZFP binding to TEs. Blue color intensity (main field) corresponds to -log10 (adjusted p-value) enrichment of ChIP-seq peak overlap with TE groups (Fishers exact test). The green/red color intensity (top panel) represents mean KAP1 (GEO accession: GSM1406445) and H3K9me3 (GEO accession: GSM1327148) enrichment (respectively) at peaks overlapping significantly targeted TEs (adjusted p-value<1e-5) in WT ES cells. (B) Summarized ChIP-seq signal for indicated KRAB-ZFPs and previously published KAP1 and H3K9me3 in WT ES cells across 127 intact ETn elements. (C) Heatmaps of KRAB-ZFP ChIP-seq signal at ChIP-seq peaks. For better comparison, peaks for all three KRAB-ZFPs were called with the same parameters (p<1e-10, peak enrichment >20). The top panel shows a schematic of the arrangement of the contact amino acid composition of each zinc finger. Zinc fingers are grouped and colored according to similarity, with amino acid differences relative to the five consensus fingers highlighted in white.
Figure 1—source data 1.KRAB-ZFP expression in 40 mouse tissues and cell lines (ENCODE).Mean values of replicates are shown as log2 transcripts per million.
Figure 1—source data 2.Probability heatmap of KRAB-ZFP binding to TEs.Values corresponds to -log10 (adjusted p-value) enrichment of ChIP-seq peak overlap with TE groups (Fishers exact test).
item-55 at level 3: picture
item-55 at level 4: caption: Figure 1—figure supplement 1.: ES cell-specific expression of KRAB-ZFP gene clusters.
(A) Heatmap showing expression patterns of mouse KRAB-ZFPs in 40 mouse tissues and cell lines (ENCODE). Heatmap colors indicate gene expression levels in log2 transcripts per million (TPM). The asterisk indicates a group of 30 KRAB-ZFPs that are exclusively expressed in ES cells. (B) Physical location of the genes encoding for the 30 KRAB-ZFPs that are exclusively expressed in ES cells. (C) Phylogenetic (Maximum likelihood) tree of the KRAB domains of mouse KRAB-ZFPs. KRAB-ZFPs encoded on the gene clusters on chromosome 2 and 4 are highlighted. The scale bar at the bottom indicates amino acid substitutions per site.
item-56 at level 3: picture
item-56 at level 4: caption: Figure 1—figure supplement 2.: KRAB-ZFP binding motifs and their repression activity.
(A) Comparison of computationally predicted (bottom) and experimentally determined (top) KRAB-ZFP binding motifs. Only significant pairs are shown (FDR < 0.1). (B) Luciferase reporter assays to confirm KRAB-ZFP repression of the identified target sites. Bars show the luciferase activity (normalized to Renilla luciferase) of reporter plasmids containing the indicated target sites cloned upstream of the SV40 promoter. Reporter plasmids were co-transfected into 293 T cells with a Renilla luciferase plasmid for normalization and plasmids expressing the targeting KRAB-ZFP. Normalized mean luciferase activity (from three replicates) is shown relative to luciferase activity of the reporter plasmid co-transfected with an empty pcDNA3.1 vector.
item-57 at level 3: picture
item-57 at level 4: caption: Figure 1—figure supplement 3.: KRAB-ZFP binding to ETn retrotransposons.
(A) Comparison of the PBSLys1,2 sequence with Zfp961 binding motifs in nonrepetitive peaks (Nonrep) and peaks at ETn elements. (B) Retrotransposition assays of original (ETnI1-neoTNF and MusD2-neoTNF Ribet et al., 2004) and modified reporter vectors where the Rex2 or Gm13051 binding motifs where removed. Schematic of reporter vectors are displayed at the top. HeLa cells were transfected as described in the Materials and Methods section and neo-resistant colonies, indicating retrotransposition events, were selected and stained. (C) Stem-loop structure of the ETn RNA export signal, the Gm13051 motif on the corresponding DNA is marked with red circles, the part of the motif that was deleted is indicated with grey crosses (adapted from Legiewicz et al., 2010).
item-58 at level 3: picture
item-58 at level 4: caption: Figure 2.: Retrotransposon reactivation in KRAB-ZFP cluster KO ES cells.
(A) RNA-seq analysis of TE expression in five KRAB-ZFP cluster KO ES cells. Green and grey squares on top of the panel represent KRAB-ZFPs with or without ChIP-seq data, respectively, within each deleted gene cluster. Reactivated TEs that are bound by one or several KRAB-ZFPs are indicated by green squares in the panel. Significantly up- and downregulated elements (adjusted p-value<0.05) are highlighted in red and green, respectively. (B) Differential KAP1 binding and H3K9me3 enrichment at TE groups (summarized across all insertions) in Chr2-cl and Chr4-cl KO ES cells. TE groups targeted by one or several KRAB-ZFPs encoded within the deleted clusters are highlighted in blue (differential enrichment over the entire TE sequences) and red (differential enrichment at TE regions that overlap with KRAB-ZFP ChIP-seq peaks). (C) DNA methylation status of CpG sites at indicated TE groups in WT and Chr4-cl KO ES cells grown in serum containing media or in hypomethylation-inducing media (2i + Vitamin C). P-values were calculated using paired t-test.
Figure 2—source data 1.Differential H3K9me3 and KAP1 distribution in WT and KRAB-ZFP cluster KO ES cells at TE families and KRAB-ZFP bound TE insertions.Differential read counts and statistical testing were determined by DESeq2.
item-59 at level 3: picture
item-59 at level 4: caption: Figure 2—figure supplement 1.: Epigenetic changes at TEs and TE-borne enhancers in KRAB-ZFP cluster KO ES cells.
(A) Differential analysis of summative (all individual insertions combined) H3K9me3 enrichment at TE groups in Chr10-cl, Chr13.1-cl and Chr13.2-cl KO ES cells. TE groups targeted by one or several KRAB-ZFPs encoded within the deleted clusters are highlighted in orange (differential enrichment over the entire TE sequences) and red (differential enrichment at TE regions that overlap with KRAB-ZFP ChIP-seq peaks). (B) Top: Schematic view of the Cd59a/Cd59b locus with a 5 truncated ETn insertion. ChIP-seq (Input subtracted from ChIP) data for overexpressed epitope-tagged Gm13051 (a Chr4-cl KRAB-ZFP) in F9 EC cells, and re-mapped KAP1 (GEO accession: GSM1406445) and H3K9me3 (GEO accession: GSM1327148) in WT ES cells are shown together with RNA-seq data from Chr4-cl WT and KO ES cells (mapped using Bowtie (-a -m 1 --strata -v 2) to exclude reads that cannot be uniquely mapped). Bottom: Transcriptional activity of a 5 kb fragment with or without fragments of the ETn insertion was tested by luciferase reporter assay in Chr4-cl WT and KO ES cells.
item-60 at level 3: picture
item-60 at level 4: caption: Figure 3.: TE-dependent gene activation in KRAB-ZFP cluster KO ES cells.
(A) Differential gene expression in Chr2-cl and Chr4-cl KO ES cells. Significantly up- and downregulated genes (adjusted p-value<0.05) are highlighted in red and green, respectively, KRAB-ZFP genes within the deleted clusters are shown in blue. (B) Correlation of TEs and gene deregulation. Plots show enrichment of TE groups within 100 kb of up- and downregulated genes relative to all genes. Significantly overrepresented LTR and LINE groups (adjusted p-value<0.1) are highlighted in blue and red, respectively. (C) Schematic view of the downstream region of Chst1 where a 5′ truncated ETn insertion is located. ChIP-seq (Input subtracted from ChIP) data for overexpressed epitope-tagged Gm13051 (a Chr4-cl KRAB-ZFP) in F9 EC cells, and re-mapped KAP1 (GEO accession: GSM1406445) and H3K9me3 (GEO accession: GSM1327148) in WT ES cells are shown together with RNA-seq data from Chr4-cl WT and KO ES cells (mapped using Bowtie (-a -m 1 --strata -v 2) to exclude reads that cannot be uniquely mapped). (D) RT-qPCR analysis of Chst1 mRNA expression in Chr4-cl WT and KO ES cells with or without the CRISPR/Cas9 deleted ETn insertion near Chst1. Values represent mean expression (normalized to Gapdh) from three biological replicates per sample (each performed in three technical replicates) in arbitrary units. Error bars represent standard deviation and asterisks indicate significance (p<0.01, Student's t-test). n.s.: not significant. (E) Mean coverage of ChIP-seq data (Input subtracted from ChIP) in Chr4-cl WT and KO ES cells over 127 full-length ETn insertions. The binding sites of the Chr4-cl KRAB-ZFPs Rex2 and Gm13051 are indicated by dashed lines.
item-61 at level 3: picture
item-61 at level 4: caption: Figure 4.: ETn retrotransposition in Chr4-cl KO mice.
(A) Pedigree of mice used for transposon insertion screening by capture-seq in mice of different strain backgrounds. The numbers of novel ETn insertions (only present in one animal) are indicated. For animals whose direct ancestors have not been screened, the ETn insertions are shown in parentheses since parental inheritance cannot be excluded in that case. Germ line insertions are indicated by asterisks. All DNA samples were prepared from tail tissues unless noted (-S: spleen, -E: ear, -B: blood). (B) Statistical analysis of ETn insertion frequency in tail tissue from 30 Chr4-cl KO, KO/WT and WT mice that were derived from one Chr4-cl KO x KO/WT and two Chr4-cl KO/WT x KO/WT matings. Only DNA samples that were collected from juvenile tails were considered for this analysis. P-values were calculated using one-sided Wilcoxon Rank Sum Test. In the last panel, KO, WT and KO/WT mice derived from all matings were combined for the statistical analysis.
Figure 4—source data 1. Coordinates of identified novel ETn insertions and supporting capture-seq read counts. Genomic regions indicate cluster of supporting reads.
Figure 4—source data 2. Sequences of capture-seq probes used to enrich genomic DNA for ETn and MuLV (RLTR4) insertions.
item-62 at level 3: picture
item-62 at level 4: caption: Figure 4—figure supplement 1.: Birth statistics of KRAB-ZFP cluster KO mice and TE reactivation in adult tissues.
(A) Birth statistics of Chr4- and Chr2-cl mice derived from KO/WT x KO/WT matings in different strain backgrounds. (B) RNA-seq analysis of TE expression in Chr2- (left) and Chr4-cl (right) KO tissues. TE groups with the highest reactivation phenotype in ES cells are shown separately. Significantly up- and downregulated elements (adjusted p-value<0.05) are highlighted in red and green, respectively. Experiments were performed in at least two biological replicates.
item-63 at level 3: picture
item-63 at level 4: caption: Figure 4—figure supplement 2.: Identification of polymorphic ETn and MuLV retrotransposon insertions in Chr4-cl KO and WT mice.
Heatmaps show normalized capture-seq read counts in RPM (Reads Per Million) for identified polymorphic ETn (A) and MuLV (B) loci in different mouse strains. Only loci with strong support for germ line ETn or MuLV insertions (at least 100 or 3000 ETn or MuLV RPM, respectively) in at least two animals are shown. Non-polymorphic insertion loci with high read counts in all screened mice were excluded for better visibility. The sample information (sample name and cell type/tissue) is annotated at the bottom, with the strain information indicated by color at the top. The color gradient indicates log10(RPM+1).
item-64 at level 3: picture
item-64 at level 4: caption: Figure 4—figure supplement 3.: Confirmation of novel ETn insertions identified by capture-seq.
(A) PCR validation of novel ETn insertions in genomic DNA of three littermates (IDs: T09673, T09674 and T00436) and their parents (T3913 and T3921). Primer sequences are shown in Supplementary file 3. (B) ETn capture-seq read counts (RPM) at putative novel somatic (loci identified exclusively in one single animal), novel germ line (loci identified in several littermates) insertions, and at B6 reference ETn elements. (C) Heatmap shows capture-seq read counts (RPM) of a Chr4-cl KO mouse (ID: C6733) as determined in different tissues. Each row represents a novel ETn locus that was identified in at least one tissue. The color gradient indicates log10(RPM+1). (D) Heatmap shows the capture-seq RPM in technical replicates using the same Chr4-cl KO DNA sample (rep1/rep2) or replicates with DNA samples prepared from different sections of the tail from the same mouse at different ages (tail1/tail2). Each row represents a novel ETn locus that was identified in at least one of the displayed samples. The color gradient indicates log10(RPM+1).
item-65 at level 2: section_header: References
item-66 at level 3: list: group list
item-67 at level 4: list_item: TL Bailey; M Boden; FA Buske; M ... arching. Nucleic Acids Research (2009)
item-68 at level 4: list_item: C Baust; L Gagnier; GJ Baillie; ... the mouse. Journal of Virology (2003)
item-69 at level 4: list_item: K Blaschke; KT Ebata; MM Karimi; ... -like state in ES cells. Nature (2013)
item-70 at level 4: list_item: A Brodziak; E Ziółko; M Muc-Wier ... erimental and Clinical Research (2012)
item-71 at level 4: list_item: N Castro-Diaz; G Ecco; A Colucci ... stem cells. Genes & Development (2014)
item-72 at level 4: list_item: EB Chuong; NC Elde; C Feschotte. ... ndogenous retroviruses. Science (2016)
item-73 at level 4: list_item: J Dan; Y Liu; N Liu; M Chiourea; ... n silencing. Developmental Cell (2014)
item-74 at level 4: list_item: A De Iaco; E Planet; A Coluccio; ... cental mammals. Nature Genetics (2017)
item-75 at level 4: list_item: Ö Deniz; L de la Rica; KCL Cheng ... onic stem cells. Genome Biology (2018)
item-76 at level 4: list_item: M Dewannieux; T Heidmann. Endoge ... rs. Current Opinion in Virology (2013)
item-77 at level 4: list_item: G Ecco; M Cassano; A Kauzlaric; ... ult tissues. Developmental Cell (2016)
item-78 at level 4: list_item: G Ecco; M Imbeault; D Trono. KRAB zinc finger proteins. Development (2017)
item-79 at level 4: list_item: JA Frank; C Feschotte. Co-option ... on. Current Opinion in Virology (2017)
item-80 at level 4: list_item: L Gagnier; VP Belancio; DL Mager ... ansposon insertions. Mobile DNA (2019)
item-81 at level 4: list_item: AC Groner; S Meylan; A Ciuffi; N ... omatin spreading. PLOS Genetics (2010)
item-82 at level 4: list_item: DC Hancks; HH Kazazian. Roles fo ... ns in human disease. Mobile DNA (2016)
item-83 at level 4: list_item: M Imbeault; PY Helleboid; D Tron ... ene regulatory networks. Nature (2017)
item-84 at level 4: list_item: FM Jacobs; D Greenberg; N Nguyen ... SVA/L1 retrotransposons. Nature (2014)
item-85 at level 4: list_item: H Kano; H Kurahashi; T Toda. Gen ... e dactylaplasia phenotype. PNAS (2007)
item-86 at level 4: list_item: MM Karimi; P Goyal; IA Maksakova ... cripts in mESCs. Cell Stem Cell (2011)
item-87 at level 4: list_item: A Kauzlaric; G Ecco; M Cassano; ... related genetic units. PLOS ONE (2017)
item-88 at level 4: list_item: PP Khil; F Smagulova; KM Brick; ... ction of ssDNA. Genome Research (2012)
item-89 at level 4: list_item: F Krueger; SR Andrews. Bismark: ... eq applications. Bioinformatics (2011)
item-90 at level 4: list_item: B Langmead; SL Salzberg. Fast ga ... t with bowtie 2. Nature Methods (2012)
item-91 at level 4: list_item: M Legiewicz; AS Zolotukhin; GR P ... Journal of Biological Chemistry (2010)
item-92 at level 4: list_item: JA Lehoczky; PE Thomas; KM Patri ... n Polypodia mice. PLOS Genetics (2013)
item-93 at level 4: list_item: D Leung; T Du; U Wagner; W Xie; ... methyltransferase Setdb1. PNAS (2014)
item-94 at level 4: list_item: J Lilue; AG Doran; IT Fiddes; M ... unctional loci. Nature Genetics (2018)
item-95 at level 4: list_item: S Liu; J Brind'Amour; MM Karimi; ... germ cells. Genes & Development (2014)
item-96 at level 4: list_item: MI Love; W Huber; S Anders. Mode ... ata with DESeq2. Genome Biology (2014)
item-97 at level 4: list_item: F Lugani; R Arora; N Papeta; A P ... short tail mouse. PLOS Genetics (2013)
item-98 at level 4: list_item: TS Macfarlan; WD Gifford; S Dris ... ous retrovirus activity. Nature (2012)
item-99 at level 4: list_item: IA Maksakova; MT Romanish; L Gag ... mouse germ line. PLOS Genetics (2006)
item-100 at level 4: list_item: T Matsui; D Leung; H Miyashita; ... methyltransferase ESET. Nature (2010)
item-101 at level 4: list_item: HS Najafabadi; S Mnaimneh; FW Sc ... y lexicon. Nature Biotechnology (2015)
item-102 at level 4: list_item: C Nellåker; TM Keane; B Yalcin; ... 8 mouse strains. Genome Biology (2012)
item-103 at level 4: list_item: H O'Geen; S Frietze; PJ Farnham. ... s. Methods in Molecular Biology (2010)
item-104 at level 4: list_item: A Patel; P Yang; M Tinkham; M Pr ... ndem zinc finger proteins. Cell (2018)
item-105 at level 4: list_item: D Ribet; M Dewannieux; T Heidman ... s-mobilization. Genome Research (2004)
item-106 at level 4: list_item: SR Richardson; P Gerdes; DJ Gerh ... d early embryo. Genome Research (2017)
item-107 at level 4: list_item: HM Rowe; J Jakobsson; D Mesnard; ... in embryonic stem cells. Nature (2010)
item-108 at level 4: list_item: HM Rowe; A Kapopoulou; A Corsino ... nic stem cells. Genome Research (2013)
item-109 at level 4: list_item: SN Schauer; PE Carreira; R Shukl ... carcinogenesis. Genome Research (2018)
item-110 at level 4: list_item: DC Schultz; K Ayyanathan; D Nego ... r proteins. Genes & Development (2002)
item-111 at level 4: list_item: K Semba; K Araki; K Matsumoto; H ... short tail mice. PLOS Genetics (2013)
item-112 at level 4: list_item: SP Sripathy; J Stevens; DC Schul ... Molecular and Cellular Biology (2006)
item-113 at level 4: list_item: JH Thomas; S Schneider. Coevolut ... c finger genes. Genome Research (2011)
item-114 at level 4: list_item: PJ Thompson; TS Macfarlan; MC Lo ... tory repertoire. Molecular Cell (2016)
item-115 at level 4: list_item: RS Treger; SD Pope; Y Kong; M To ... irus expression SNERV. Immunity (2019)
item-116 at level 4: list_item: CN Vlangos; AN Siuniak; D Robins ... Ptf1a expression. PLOS Genetics (2013)
item-117 at level 4: list_item: J Wang; G Xie; M Singh; AT Ghanb ... s naive-like stem cells. Nature (2014)
item-118 at level 4: list_item: D Wolf; K Hug; SP Goff. TRIM28 m ... iruses in embryonic cells. PNAS (2008)
item-119 at level 4: list_item: G Wolf; D Greenberg; TS Macfarla ... ger protein family. Mobile DNA (2015a)
item-120 at level 4: list_item: G Wolf; P Yang; AC Füchtbauer; E ... roviruses. Genes & Development (2015b)
item-121 at level 4: list_item: M Yamauchi; B Freitag; C Khan; B ... silencers. Journal of Virology (1995)
item-122 at level 4: list_item: Y Zhang; T Liu; CA Meyer; J Eeck ... ChIP-Seq (MACS). Genome Biology (2008)
item-123 at level 1: caption: Table 1.: * Number of protein-co ... ChIP-seq was performed in this study.
item-124 at level 1: caption: Key resources table:
item-125 at level 1: caption: Figure 1.: Genome-wide binding p ... with TE groups (Fishers exact test).
item-126 at level 1: caption: Figure 1—figure supplement 1.: E ... tes amino acid substitutions per site.
item-127 at level 1: caption: Figure 1—figure supplement 2.: K ... sfected with an empty pcDNA3.1 vector.
item-128 at level 1: caption: Figure 1—figure supplement 3.: K ... (adapted from Legiewicz et al., 2010).
item-129 at level 1: caption: Figure 2.: Retrotransposon react ... cal testing were determined by DESeq2.
item-130 at level 1: caption: Figure 2—figure supplement 1.: E ... r assay in Chr4-cl WT and KO ES cells.
item-131 at level 1: caption: Figure 3.: TE-dependent gene act ... Gm13051 are indicated by dashed lines.
item-132 at level 1: caption: Figure 4.: ETn retrotranspositio ... A for ETn and MuLV (RLTR4) insertions.
item-133 at level 1: caption: Figure 4—figure supplement 1.: B ... in at least two biological replicates.
item-134 at level 1: caption: Figure 4—figure supplement 2.: I ... color gradient indicates log10(RPM+1).
item-135 at level 1: caption: Figure 4—figure supplement 3.: C ... color gradient indicates log10(RPM+1).

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,268 @@
# KRAB-zinc finger protein gene expansion in response to active retrotransposons in the murine lineage
Wolf Gernot; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; de Iaco Alberto; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Sun Ming-An; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Bruno Melania; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Tinkham Matthew; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Hoang Don; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Mitra Apratim; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Ralls Sherry; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States; Trono Didier; 2: School of Life Sciences, École Polytechnique Fédérale de Lausanne (EPFL): Lausanne: Switzerland; Macfarlan Todd S; 1: The Eunice Kennedy Shriver National Institute of Child Health and Human Development, The National Institutes of Health: Bethesda: United States
## Abstract
The Krüppel-associated box zinc finger protein (KRAB-ZFP) family diversified in mammals. The majority of human KRAB-ZFPs bind transposable elements (TEs), however, since most TEs are inactive in humans it is unclear whether KRAB-ZFPs emerged to suppress TEs. We demonstrate that many recently emerged murine KRAB-ZFPs also bind to TEs, including the active ETn, IAP, and L1 families. Using a CRISPR/Cas9-based engineering approach, we genetically deleted five large clusters of KRAB-ZFPs and demonstrate that target TEs are de-repressed, unleashing TE-encoded enhancers. Homozygous knockout mice lacking one of two KRAB-ZFP gene clusters on chromosome 2 and chromosome 4 were nonetheless viable. In pedigrees of chromosome 4 cluster KRAB-ZFP mutants, we identified numerous novel ETn insertions with a modest increase in mutants. Our data strongly support the current model that recent waves of retrotransposon activity drove the expansion of KRAB-ZFP genes in mice and that many KRAB-ZFPs play a redundant role restricting TE activity.
## Introduction
Nearly half of the human and mouse genomes consist of transposable elements (TEs). TEs shape the evolution of species, serving as a source for genetic innovation (Chuong et al., 2016; Frank and Feschotte, 2017). However, TEs also potentially harm their hosts by insertional mutagenesis, gene deregulation and activation of innate immunity (Maksakova et al., 2006; Kano et al., 2007; Brodziak et al., 2012; Hancks and Kazazian, 2016). To protect themselves from TE activity, host organisms have developed a wide range of defense mechanisms targeting virtually all steps of the TE life cycle (Dewannieux and Heidmann, 2013). In tetrapods, KRAB zinc finger protein (KRAB-ZFP) genes have amplified and diversified, likely in response to TE colonization (Thomas and Schneider, 2011; Najafabadi et al., 2015; Wolf et al., 2015a; Wolf et al., 2015b; Imbeault et al., 2017). Conventional ZFPs bind DNA using tandem arrays of C2H2 zinc finger domains, each capable of specifically interacting with three nucleotides, whereas some zinc fingers can bind two or four nucleotides and include DNA backbone interactions depending on target DNA structure (Patel et al., 2018). This allows KRAB-ZFPs to flexibly bind to large stretches of DNA with high affinity. The KRAB domain binds the corepressor KAP1, which in turn recruits histone modifying enzymes including the NuRD histone deacetylase complex and the H3K9-specific methylase SETDB1 (Schultz et al., 2002; Sripathy et al., 2006), which induces persistent and heritable gene silencing (Groner et al., 2010). Deletion of KAP1 (Rowe et al., 2010) or SETDB1 (Matsui et al., 2010) in mouse embryonic stem (ES) cells induces TE reactivation and cell death, but only minor phenotypes in differentiated cells, suggesting KRAB-ZFPs are most important during early embryogenesis where they mark TEs for stable epigenetic silencing that persists through development. 
However, SETDB1-containing complexes are also required to repress TEs in primordial germ cells (Liu et al., 2014) and adult tissues (Ecco et al., 2016), indicating KRAB-ZFPs are active beyond early development.
TEs, especially long terminal repeat (LTR) retrotransposons, also known as endogenous retroviruses (ERVs), can affect expression of neighboring genes through their promoter and enhancer functions (Macfarlan et al., 2012; Wang et al., 2014; Thompson et al., 2016). KAP1 deletion in mouse ES cells causes rapid gene deregulation (Rowe et al., 2013), indicating that KRAB-ZFPs may regulate gene expression by recruiting KAP1 to TEs. Indeed, Zfp809 knock-out (KO) in mice resulted in transcriptional activation of a handful of genes in various tissues adjacent to ZFP809-targeted VL30-Pro elements (Wolf et al., 2015b). It has therefore been speculated that KRAB-ZFPs bind to TE sequences to domesticate them for gene regulatory innovation (Ecco et al., 2017). This idea is supported by the observation that many human KRAB-ZFPs target TE groups that have lost their coding potential millions of years ago and that KRAB-ZFP target sequences within TEs are in some cases under purifying selection (Imbeault et al., 2017). However, there are also clear signs of an evolutionary arms-race between human TEs and KRAB-ZFPs (Jacobs et al., 2014), indicating that some KRAB-ZFPs may limit TE mobility for stretches of evolutionary time, prior to their ultimate loss from the genome or adaptation for other regulatory functions. Here we use the laboratory mouse, which has undergone a recent expansion of the KRAB-ZFP family, to determine the in vivo requirement of the majority of evolutionarily young KRAB-ZFP genes.
## Results
### Mouse KRAB-ZFPs target retrotransposons
We analyzed the RNA expression profiles of mouse KRAB-ZFPs across a wide range of tissues to identify candidates active in early embryos/ES cells. While the majority of KRAB-ZFPs are expressed at low levels and uniformly across tissues, a group of KRAB-ZFPs are highly and almost exclusively expressed in ES cells (Figure 1—figure supplement 1A). About two thirds of these KRAB-ZFPs are physically linked in two clusters on chromosome 2 (Chr2-cl) and 4 (Chr4-cl) (Figure 1—figure supplement 1B). These two clusters encode 40 and 21 KRAB-ZFP annotated genes, respectively, which, with one exception on Chr4-cl, do not have orthologues in rat or any other sequenced mammals (Supplementary file 1). The KRAB-ZFPs within these two genomic clusters also group together phylogenetically (Figure 1—figure supplement 1C), indicating these gene clusters arose by a series of recent segmental gene duplications (Kauzlaric et al., 2017).
To determine the binding sites of the KRAB-ZFPs within these and other gene clusters, we expressed epitope-tagged KRAB-ZFPs using stably integrating vectors in mouse embryonic carcinoma (EC) or ES cells (Table 1, Supplementary file 1) and performed chromatin immunoprecipitation followed by deep sequencing (ChIP-seq). We then determined whether the identified binding sites are significantly enriched over annotated TEs and used the non-repetitive peak fraction to identify binding motifs. We discarded 7 of 68 ChIP-seq datasets because we could not obtain a binding motif or a target TE and manual inspection confirmed low signal to noise ratio. Of the remaining 61 KRAB-ZFPs, 51 significantly overlapped at least one TE subfamily (adjusted p-value<1e-5). Altogether, 81 LTR retrotransposon, 18 LINE, 10 SINE and one DNA transposon subfamilies were targeted by at least one of the 51 KRAB-ZFPs (Figure 1A and Supplementary file 1). Chr2-cl KRAB-ZFPs preferably bound IAPEz retrotransposons and L1-type LINEs, while Chr4-cl KRAB-ZFPs targeted various retrotransposons, including the closely related MMETn (hereafter referred to as ETn) and ETnERV (also known as MusD) elements (Figure 1A). ETn elements are non-autonomous LTR retrotransposons that require trans-complementation by the fully coding ETnERV elements that contain Gag, Pro and Pol genes (Ribet et al., 2004). These elements have accumulated to ~240 and ~100 copies in the reference C57BL/6 genome, respectively, with ~550 solitary LTRs (Baust et al., 2003). Both ETn and ETnERVs are still active, generating polymorphisms and mutations in several mouse strains (Gagnier et al., 2019). The validity of our ChIP-seq screen was confirmed by the identification of binding motifs - which often resembled the computationally predicted motifs (Figure 1—figure supplement 2A) - for the majority of screened KRAB-ZFPs (Supplementary file 1). 
Moreover, predicted and experimentally determined motifs were found in targeted TEs in most cases (Supplementary file 1), and reporter repression assays confirmed KRAB-ZFP induced silencing for all the tested sequences (Figure 1—figure supplement 2B). Finally, we observed KAP1 and H3K9me3 enrichment at most of the targeted TEs in wild type ES cells, indicating that most of these KRAB-ZFPs are functionally active in the early embryo (Figure 1A).
We generally observed that KRAB-ZFPs present exclusively in mouse target TEs that are restricted to the mouse genome, indicating KRAB-ZFPs and their targets emerged together. For example, several mouse-specific KRAB-ZFPs in Chr2-cl and Chr4-cl target IAP and ETn elements which are only found in the mouse genome and are highly active. This is the strongest data to date supporting that recent KRAB-ZFP expansions in these young clusters are a response to recent TE activity. Likewise, ZFP599 and ZFP617, both conserved in Muroidea, bind to various ORR1-type LTRs which are present in the rat genome (Supplementary file 1). However, ZFP961, a KRAB-ZFP encoded on a small gene cluster on chromosome 8 that is conserved in Muroidea targets TEs that are only found in the mouse genome (e.g. ETn), a paradox we have previously observed with ZFP809, which also targets TEs that are evolutionarily younger than itself (Wolf et al., 2015b). The ZFP961 binding site is located at the 5′ end of the internal region of ETn and ETnERV elements, a sequence that usually contains the primer binding site (PBS), which is required to prime retroviral reverse transcription. Indeed, the ZFP961 motif closely resembles the PBSLys1,2 (Figure 1—figure supplement 3A), which had been previously identified as a KAP1-dependent target of retroviral repression (Yamauchi et al., 1995; Wolf et al., 2008). Repression of the PBSLys1,2 by ZFP961 was also confirmed in reporter assays (Figure 1—figure supplement 2B), indicating that ZFP961 is likely responsible for this silencing effect.
To further test the hypothesis that KRAB-ZFPs target sites necessary for retrotransposition, we utilized previously generated ETn and ETnERV retrotransposition reporters in which we mutated KRAB-ZFP binding sites (Ribet et al., 2004). Whereas the ETnERV reporters are sufficient for retrotransposition, the ETn reporter requires ETnERV genes supplied in trans. We tested and confirmed that the REX2/ZFP600 and GM13051 binding sites within these TEs are required for efficient retrotransposition (Figure 1—figure supplement 3B). REX2 and ZFP600 both bind a target about 200 bp from the start of the internal region (Figure 1B), a region that often encodes the packaging signal. GM13051 binds a target coding for part of a highly structured mRNA export signal (Legiewicz et al., 2010) near the 3′ end of the internal region of ETn (Figure 1—figure supplement 3C). Both signals are characterized by stem-loop intramolecular base-pairing in which a single mutation can disrupt loop formation. This indicates that at least some KRAB-ZFPs evolved to bind functionally essential target sequences which cannot easily evade repression by mutation.
Our KRAB-ZFP ChIP-seq dataset also provided unique insights into the emergence of new KRAB-ZFPs and binding patterns. The Chr4-cl KRAB-ZFPs REX2 and ZFP600 bind to the same target within ETn but with varying affinity (Figure 1C). Comparison of the amino acids responsible for DNA contact revealed a high similarity between REX2 and ZFP600, with the main differences at the most C-terminal zinc fingers. Additionally, we found that GM30910, another KRAB-ZFP encoded in the Chr4-cl, also shows a strong similarity to both KRAB-ZFPs yet targets entirely different groups of TEs (Figure 1C and Supplementary file 1). Together with previously shown data (Ecco et al., 2016), this example highlights how addition of a few new zinc fingers to an existing array can entirely shift the mode of DNA binding.
### Genetic deletion of KRAB-ZFP gene clusters leads to retrotransposon reactivation
The majority of KRAB-ZFP genes are harbored in large, highly repetitive clusters that have formed by successive complex segmental duplications (Kauzlaric et al., 2017), rendering them inaccessible to conventional gene targeting. We therefore developed a strategy to delete entire KRAB-ZFP gene clusters in ES cells (including the Chr2-cl and Chr4-cl as well as two clusters on chromosome 13 and a cluster on chromosome 10) using two CRISPR/Cas9 gRNAs targeting unique regions flanking each cluster, and short single-stranded repair oligos with homologies to both sides of the projected cut sites. Using this approach, we generated five cluster KO ES cell lines in at least two biological replicates and performed RNA sequencing (RNA-seq) to determine TE expression levels. Strikingly, four of the five cluster KO ES cells exhibited distinct TE reactivation phenotypes (Figure 2A). Chr2-cl KO resulted in reactivation of several L1 subfamilies as well as RLTR10 (up to more than 100-fold as compared to WT) and IAPEz ERVs. In contrast, the most strongly upregulated TEs in Chr4-cl KO cells were ETn/ETnERV (up to 10-fold as compared to WT), with several other ERV groups modestly reactivated. ETn/ETnERV elements were also upregulated in Chr13.2-cl KO ES cells while the only upregulated ERVs in Chr13.1-cl KO ES cells were MMERVK10C elements (Figure 2A). Most reactivated retrotransposons were targeted by at least one KRAB-ZFP that was encoded in the deleted cluster (Figure 2A and Supplementary file 1), indicating a direct effect of these KRAB-ZFPs on TE expression levels. Furthermore, we observed a loss of KAP1 binding and H3K9me3 at several TE subfamilies that are targeted by at least one KRAB-ZFP within the deleted Chr2-cl and Chr4-cl (Figure 2B, Figure 2—figure supplement 1A), including L1, ETn and IAPEz elements. 
Using reduced representation bisulfite sequencing (RRBS-seq), we found that a subset of KRAB-ZFP bound TEs were partially hypomethylated in Chr4-cl KO ES cells, but only when grown in genome-wide hypomethylation-inducing conditions (Blaschke et al., 2013; Figure 2C and Supplementary file 2). These data are consistent with the hypothesis that KRAB-ZFPs/KAP1 are not required to establish DNA methylation, but under certain conditions they protect specific TEs and imprint control regions from genome-wide demethylation (Leung et al., 2014; Deniz et al., 2018).
### KRAB-ZFP cluster deletions license TE-borne enhancers
We next used our RNA-seq datasets to determine the effect of KRAB-ZFP cluster deletions on gene expression. We identified 195 significantly upregulated and 130 downregulated genes in Chr4-cl KO ES cells, and 108 upregulated and 59 downregulated genes in Chr2-cl KO ES cells (excluding genes on the deleted cluster) (Figure 3A). To address whether gene deregulation in Chr2-cl and Chr4-cl KO ES cells is caused by nearby TE reactivation, we determined whether genes near certain TE subfamilies are more frequently deregulated than random genes. We found a strong correlation of gene upregulation and TE proximity for several TE subfamilies, of which many became transcriptionally activated themselves (Figure 3B). For example, nearly 10% of genes that are located within 100 kb (up- or downstream of the TSS) of an ETn element are upregulated in Chr4-cl KO ES cells, as compared to 0.8% of all genes. In Chr2-cl KO ES cells, upregulated genes were significantly enriched near various LINE groups but also IAPEz-int and RLTR10-int elements, indicating that TE-binding KRAB-ZFPs in these clusters limit the potential activating effects of TEs on nearby genes.
While we generally observed that TE-associated gene reactivation is not caused by elongated or spliced transcription starting at the retrotransposons, we did observe that the strength of the effect of ETn elements on gene expression is stronger on genes in closer proximity. About 25% of genes located within 20 kb of an ETn element, but only 5% of genes located at a distance between 50 and 100 kb from the nearest ETn insertion, become upregulated in Chr4-cl KO ES cells. Importantly however, the correlation is still significant for genes that are located at distances between 50 and 100 kb from the nearest ETn insertion, indicating that ETn elements can act as long-range enhancers of gene expression in the absence of KRAB-ZFPs that target them. To confirm that Chr4-cl KRAB-ZFPs such as GM13051 block ETn-borne enhancers, we tested the ability of a putative ETn enhancer to activate transcription in a reporter assay. For this purpose, we cloned a 5 kb fragment spanning from the GM13051 binding site within the internal region of a truncated ETn insertion to the first exon of the Cd59a gene, which is strongly activated in Chr4-cl KO ES cells (Figure 2—figure supplement 1B). We observed strong transcriptional activity of this fragment which was significantly higher in Chr4-cl KO ES cells. Surprisingly, this activity was reduced to background when the internal segment of the ETn element was not included in the fragment, suggesting the internal segment of the ETn element, but not its LTR, contains a Chr4-cl KRAB-ZFP sensitive enhancer. To further corroborate these findings, we genetically deleted an ETn element that is located about 60 kb from the TSS of Chst1, one of the top-upregulated genes in Chr4-cl KO ES cells (Figure 3C). RT-qPCR analysis revealed that the Chst1 upregulation phenotype in Chr4-cl KO ES cells diminishes when the ETn insertion is absent, providing direct evidence that a KRAB-ZFP controlled ETn-borne enhancer regulates Chst1 expression (Figure 3D). 
Furthermore, ChIP-seq confirmed a general increase of H3K4me3, H3K4me1 and H3K27ac marks at ETn elements in Chr4-cl KO ES cells (Figure 3E). Notably, enhancer marks were most pronounced around the GM13051 binding site near the 3′ end of the internal region, confirming that the enhancer activity of ETn is located on the internal region and not on the LTR.
### ETn retrotransposition in Chr4-cl KO and WT mice
IAP, ETn/ETnERV and MuLV/RLTR4 retrotransposons are highly polymorphic in inbred mouse strains (Nellåker et al., 2012), indicating that these elements are able to mobilize in the germ line. Since these retrotransposons are upregulated in Chr2-cl and Chr4-cl KO ES cells, we speculated that these KRAB-ZFP clusters evolved to minimize the risks of insertional mutagenesis by retrotransposition. To test this, we generated Chr2-cl and Chr4-cl KO mice via ES cell injection into blastocysts, and after germ line transmission we genotyped the offspring of heterozygous breeding pairs. While the offspring of Chr4-cl KO/WT parents were born close to Mendelian ratios in pure C57BL/6 and mixed C57BL/6 129Sv matings, one Chr4-cl KO/WT breeding pair gave birth to significantly fewer KO mice than expected (p-value=0.022) (Figure 4—figure supplement 1A). Likewise, two out of four Chr2-cl KO breeding pairs on mixed C57BL/6 129Sv matings failed to give birth to a single KO offspring (p-value<0.01) while the two other mating pairs produced KO offspring at near Mendelian ratios (Figure 4—figure supplement 1A). Altogether, these data indicate that KRAB-ZFP clusters are not absolutely essential in mice, but that genetic and/or epigenetic factors may contribute to reduced viability.
We reasoned that retrotransposon activation could account for the reduced viability of Chr2-cl and Chr4-cl KO mice in some matings. However, since only rare matings produced non-viable KO embryos, we instead turned to the viable KO mice to assay for increased transposon activity. RNA-seq in blood, brain and testis revealed that, with a few exceptions, retrotransposons upregulated in Chr2 and Chr4 KRAB-ZFP cluster KO ES cells are not expressed at higher levels in adult tissues (Figure 4—figure supplement 1B). Likewise, no strong transcriptional TE reactivation phenotype was observed in liver and kidney of Chr4-cl KO mice (data not shown) and ChIP-seq with antibodies against H3K4me1, H3K4me3 and H3K27ac in testis of Chr4-cl WT and KO mice revealed no increase of active histone marks at ETn elements or other TEs (data not shown). This indicates that Chr2-cl and Chr4-cl KRAB-ZFPs are primarily required for TE repression during early development. This is consistent with the high expression of these KRAB-ZFPs uniquely in ES cells (Figure 1—figure supplement 1A). To determine whether retrotransposition occurs at a higher frequency in Chr4-cl KO mice during development, we screened for novel ETn (ETn/ETnERV) and MuLV (MuLV/RLTR4\_MM) insertions in viable Chr4-cl KO mice. For this purpose, we developed a capture-sequencing approach to enrich for ETn/MuLV DNA and flanking sequences from genomic DNA using probes that hybridize with the 5′ and 3′ ends of ETn and MuLV LTRs prior to deep sequencing. We screened genomic DNA samples from a total of 76 mice, including 54 mice from ancestry-controlled Chr4-cl KO matings in various strain backgrounds, the two ES cell lines the Chr4-cl KO mice were generated from, and eight mice from a Chr2-cl KO mating which served as a control (since ETn and MuLVs are not activated in Chr2-cl KO ES cells) (Supplementary file 4). 
Using this approach, we were able to enrich reads mapping to ETn/MuLV LTRs about 2,000-fold compared to genome sequencing without capture. ETn/MuLV insertions were determined by counting uniquely mapped reads that were paired with reads mapping to ETn/MuLV elements (see materials and methods for details). To assess the efficiency of the capture approach, we determined what proportion of a set of 309 largely intact (two LTRs flanking an internal sequence) reference ETn elements could be identified using our sequencing data. 95% of these insertions were called with high confidence in the majority of our samples (data not shown), indicating that we are able to identify ETn insertions at a high recovery rate.
Using this dataset, we first confirmed the polymorphic nature of both ETn and MuLV retrotransposons in laboratory mouse strains (Figure 4—figure supplement 2A), highlighting the potential of these elements to retrotranspose. To identify novel insertions, we filtered out insertions that were supported by ETn/MuLV-paired reads in more than one animal. While none of the 54 ancestry-controlled mice showed a single novel MuLV insertion, we observed greatly varying numbers of up to 80 novel ETn insertions in our pedigree (Figure 4A).
To validate some of the novel ETn insertions, we designed specific PCR primers for five of the insertions and screened genomic DNA of the mice in which they were identified as well as their parents. For all tested insertions, we were able to amplify their flanking sequence and show that these insertions are absent in their parents (Figure 4—figure supplement 3A). To confirm their identity, we amplified and sequenced three of the novel full-length ETn insertions. Two of these elements (Genbank accession: MH449667-68) resembled typical ETnII elements with identical 5′ and 3′ LTRs and target site duplications (TSD) of 4 or 6 bp, respectively. The third sequenced element (MH449669) represented a hybrid element that contains both ETnI and MusD (ETnERV) sequences. Similar insertions can be found in the B6 reference genome; however, the identified novel insertion has a 2.5 kb deletion of the 5′ end of the internal region. Additionally, the 5′ and 3′ LTR of this element differ in one nucleotide near the start site and contain an unusually large 248 bp TSD (containing a SINE repeat) indicating that an improper integration process might have truncated this element.
Besides novel ETn insertions that were only identified in one specific animal, we also observed three ETn insertions that could be detected in several siblings but not in their parents or any of the other screened mice. This strongly indicates that these retrotransposition events occurred in the germ line of the parents from which they were passed on to some of their offspring. One of these germ line insertions was evidently passed on from the offspring to the next generation (Figure 4A). As expected, the read numbers supporting these novel germ line insertions were comparable to the read numbers that were found in the flanking regions of annotated B6 ETn insertions (Figure 4—figure supplement 3B). In contrast, virtually all novel insertions that were only found in one animal were supported by significantly fewer reads (Figure 4—figure supplement 3B). This indicates that these elements resulted from retrotransposition events in the developing embryo and not in the zygote or parental germ cells. Indeed, we detected different sets of insertions in various tissues from the same animal (Figure 4—figure supplement 3C). Even between tail samples that were collected from the same animal at different ages, only a fraction of the new insertions were present in both samples, while technical replicates from the same genomic DNA samples showed a nearly complete overlap in insertions (Figure 4—figure supplement 3D).
Finally, we asked whether there were more novel ETn insertions in mice lacking the Chr4-cl relative to their wild type and heterozygous littermates in our pedigree. Interestingly, only one out of the eight Chr4-cl KO mice in a pure C57BL/6 strain background and none of the eight offspring from a Chr2-cl mating carried a single novel ETn insertion (Figure 4A). When crossing into a 129Sv background for a single generation before intercrossing heterozygous mice (F1), we observed 4 out of 8 Chr4-cl KO mice that contained at least one new ETn insertion, whereas none of 3 heterozygous mice contained any insertions. After crossing to the 129Sv background for a second generation (F2), we determined the number of novel ETn insertions in the offspring of one KO/WT x KO and two KO/WT x KO/WT matings, excluding all samples that were not derived from juvenile tail tissue. Only in the offspring of the KO/WT x KO mating, we observed a statistically significant higher average number of ETn insertions in KO vs. KO/WT animals (7.3 vs. 29.6, p=0.045, Figure 4B). Other than that, only a non-significant trend towards greater average numbers of ETn insertions in KO (11 vs. 27.8, p=0.192, Figure 4B) was apparent in one of the WT/KO x KO/WT matings whereas no difference in ETn insertion numbers between WT and KO mice could be observed in the second mating WT/KO x KO/WT (26 vs. 31, p=0.668, Figure 4B). When comparing all KO with all WT and WT/KO mice from these three matings, a trend towards more ETn insertions in KO remained but was not supported by strong significance (26 vs. 13, p=0.057, Figure 4B). Altogether, we observed a high variability in the number of new ETn insertions in both KO and WT but our data suggest that the Chr4-cl KRAB-ZFPs may have a modest effect on ETn retrotransposition rates in some mouse strains but other genetic and epigenetic effects clearly also play an important role.
## Discussion
C2H2 zinc finger proteins, about half of which contain a KRAB repressor domain, represent the largest DNA-binding protein family in mammals. Nevertheless, most of these factors have not been investigated using loss-of-function studies. The most comprehensive characterization of human KRAB-ZFPs revealed a strong preference to bind TEs (Imbeault et al., 2017; Najafabadi et al., 2015) yet their function remains unknown. In humans, very few TEs are capable of retrotransposition yet many of them, often tens of million years old, are bound by KRAB-ZFPs. While this suggests that human KRAB-ZFPs mainly serve to control TE-borne enhancers and may have potentially transcription-independent functions, we were interested in the biological significance of KRAB-ZFPs in restricting potentially active TEs. The mouse is an ideal model for such studies since the mouse genome contains several active TE families, including IAP, ETn and L1 elements. We found that many of the young KRAB-ZFPs present in the genomic clusters of KRAB-ZFPs on chromosomes 2 and 4, which are highly expressed in a restricted pattern in ES cells, bound redundantly to these three active TE families. In several cases, KRAB-ZFPs bound to functionally constrained sequence elements we and others have demonstrated to be necessary for retrotransposition, including PBS and viral packaging signals. Targeting such sequences may help the host defense system keep pace with rapidly evolving mouse transposons. This provides strong evidence that many young KRAB-ZFPs are indeed expanding in response to TE activity. But do these young KRAB-ZFP genes limit the mobilization of TEs? 
Despite the large number of polymorphic ETn elements in mouse strains (Nellåker et al., 2012) and several reports of phenotype-causing novel ETn germ line insertions, no new ETn insertions were reported in recent screens of C57BL/6 mouse genomes (Richardson et al., 2017; Gagnier et al., 2019), indicating that the overall rate of ETn germ line mobilization in inbred mice is rather low. We have demonstrated that Chr4-cl KRAB-ZFPs control ETn/ETnERV expression in ES cells, but this does not lead to widespread ETn mobility in viable C57BL/6 mice. In contrast, we found numerous novel, including several germ line, ETn insertions in both WT and Chr4-cl KO mice in a C57BL/6 129Sv mixed genetic background, with generally more insertions in KO mice and in mice with more 129Sv DNA. This is consistent with a report detecting ETn insertions in FVB.129 mice (Schauer et al., 2018). Notably, there was a large variation in the number of new insertions in these mice, possibly caused by hyperactive polymorphic ETn insertions that varied from individual to individual, epigenetic variation at ETn insertions between individuals and/or the general stochastic nature of ETn mobilization. Furthermore, recent reports have suggested that KRAB-ZFP gene content is distinct in different strains of laboratory mice (Lilue et al., 2018; Treger et al., 2019), and reduced KRAB-ZFP gene content could contribute to increased activity in individual mice. Although we have yet to find obvious phenotypes in the mice carrying new insertions, novel ETn germ line insertions have been shown to cause phenotypes from short tails (Lugani et al., 2013; Semba et al., 2013; Vlangos et al., 2013) to limb malformation (Kano et al., 2007) and severe morphogenetic defects including polypodia (Lehoczky et al., 2013) depending upon their insertion site.
Despite a lack of widespread ETn activation in Chr4-cl KO mice, it still remains to be determined whether other TEs, like L1, IAP or other LTR retrotransposons are activated in any of the KRAB-ZFP cluster KO mice, which will require the development of additional capture-seq based assays. Notably, two of the heterozygous matings from Chr2-cl KO mice failed to produce viable knockout offspring, which could indicate a TE-reactivation phenotype. It may also be necessary to generate compound homozygous mutants of distinct KRAB-ZFP clusters to eliminate redundancy before TEs become unleashed. The KRAB-ZFP cluster knockouts produced here will be useful reagents to test such hypotheses. In sum, our data supports that a major driver of KRAB-ZFP gene expansion in mice is recent retrotransposon insertions, and that redundancy within the KRAB-ZFP gene family and with other TE restriction pathways provides protection against widespread TE mobility, explaining the non-essential function of the majority of KRAB-ZFP genes.
## Materials and methods
### Cell lines and transgenic mice
Mouse ES cells and F9 EC cells were cultivated as described previously (Wolf et al., 2015b) unless stated otherwise. Chr4-cl KO ES cells originate from B6;129 Gt(ROSA)26Sortm1(cre/ERT)Nat/J mice (Jackson lab), all other KRAB-ZFP cluster KO ES cell lines originate from JM8A3.N1 C57BL/6N-Atm1Brd ES cells (KOMP Repository). Chr2-cl KO and WT ES cells were initially grown in serum-containing media (Wolf et al., 2015b) but changed to 2i media (De Iaco et al., 2017) for several weeks before analysis. To generate Chr4-cl and Chr2-cl KO mice, the cluster deletions were repeated in B6 ES (KOMP repository) or R1 (Nagy lab) ES cells, respectively, and heterozygous clones were injected into B6 albino blastocysts. Chr2-cl KO mice were therefore kept on a mixed B6/Svx129/Sv-CP strain background while Chr4-cl KO mice were initially derived on a pure C57BL/6 background. For capture-seq screens, Chr4-cl KO mice were crossed with 129 × 1/SvJ mice (Jackson lab) to produce the founder mice for Chr4-cl KO and WT (B6/129 F1) offspring. Chr4-cl KO/WT (B6/129 F1) were also crossed with 129 × 1/SvJ mice to get Chr4-cl KO/WT (B6/129 F1) mice, which were intercrossed to give rise to the parents of Chr4-cl KO/KO and KO/WT (B6/129 F2) offspring.
### Generation of KRAB-ZFP expressing cell lines
KRAB-ZFP ORFs were PCR-amplified from cDNA or synthesized with codon-optimization (Supplementary file 1), and stably expressed with 3XFLAG or 3XHA tags in F9 EC or ES cells using Sleeping beauty transposon-based (Wolf et al., 2015b) or lentiviral expression vectors (Imbeault et al., 2017; Supplementary file 1). Cells were selected with puromycin (1 µg/ml) and resistant clones were pooled and further expanded for ChIP-seq.
### CRISPR/Cas9 mediated deletion of KRAB-ZFP clusters and an MMETn insertion
All gRNAs were expressed from the pX330-U6-Chimeric\_BB-CBh-hSpCas9 vector (RRID:Addgene\_42230) and nucleofected into 10^6 ES cells using Amaxa nucleofection in the following amounts: 5 µg of each pX330-gRNA plasmid, 1 µg pPGK-puro and 500 pmoles single-stranded repair oligos (Supplementary file 3). One day after nucleofection, cells were kept under puromycin selection (1 µg/ml) for 24 hr. Individual KO and WT clones were picked 7–8 days after nucleofection and expanded for PCR genotyping (Supplementary file 3).
### ChIP-seq analysis
For ChIP-seq analysis of KRAB-ZFP expressing cells, 5–10 × 10^7 cells were crosslinked and immunoprecipitated with anti-FLAG (Sigma-Aldrich Cat# F1804, RRID:AB\_262044) or anti-HA (Abcam Cat# ab9110, RRID:AB\_307019 or Covance Cat# MMS-101P-200, RRID:AB\_10064068) antibody using one of two previously described protocols (O'Geen et al., 2010; Imbeault et al., 2017) as indicated in Supplementary file 1. H3K9me3 distribution in Chr4-cl, Chr10-cl, Chr13.1-cl and Chr13.2-cl KO ES cells was determined by native ChIP-seq with anti-H3K9me3 serum (Active Motif Cat# 39161, RRID:AB\_2532132) as described previously (Karimi et al., 2011). In Chr2-cl KO ES cells, H3K9me3 and KAP1 ChIP-seq was performed as previously described (Ecco et al., 2016). In Chr4-cl KO and WT ES cells KAP1 binding was determined by endogenous tagging of KAP1 with C-terminal GFP (Supplementary file 3), followed by FACS to enrich for GFP-positive cells and ChIP with anti-GFP (Thermo Fisher Scientific Cat# A-11122, RRID:AB\_221569) using a previously described protocol (O'Geen et al., 2010). For ChIP-seq analysis of active histone marks, cross-linked chromatin from ES cells or testis (from two-week old mice) was immunoprecipitated with antibodies against H3K4me3 (Abcam Cat# ab8580, RRID:AB\_306649), H3K4me1 (Abcam Cat# ab8895, RRID:AB\_306847) and H3K27ac (Abcam Cat# ab4729, RRID:AB\_2118291) following the protocol developed by O'Geen et al., 2010 or Khil et al., 2012 respectively.
ChIP-seq libraries were constructed and sequenced as indicated in Supplementary file 4. Reads were mapped to the mm9 genome using Bowtie (RRID:SCR\_005476; settings: --best) or Bowtie2 (Langmead and Salzberg, 2012) as indicated in Supplementary file 4. Under these settings, reads that map to multiple genomic regions are assigned to the top-scored match and, if a set of equally good choices is encountered, a pseudo-random number is used to choose one location. Peaks were called using MACS14 (RRID:SCR\_013291) under high stringency settings (p<1e-10, peak enrichment >20) (Zhang et al., 2008). Peaks were called both over the Input control and a FLAG or HA control ChIP (unless otherwise stated in Supplementary file 4) and only peaks that were called in both settings were kept for further analysis. In cases when the stringency settings did not result in at least 50 peaks, the settings were changed to medium (p<1e-10, peak enrichment >10) or low (p<1e-5, peak enrichment >10) stringency (Supplementary file 4). For further analysis, all peaks were scaled to 200 bp regions centered around the peak summits. The overlap of the scaled peaks to each repeat element in UCSC Genome Browser (RRID:SCR\_005780) were calculated by using the bedfisher function (settings: -f 0.25) from BEDTools (RRID:SCR\_006646). The right-tailed p-values between pair-wise comparison of each ChIP-seq peak and repeat element were extracted, and then adjusted using the Benjamini-Hochberg approach implemented in the R function p.adjust(). Binding motifs were determined using only nonrepetitive (<10% repeat content) peaks with MEME (Bailey et al., 2009). MEME motifs were compared with in silico predicted motifs (Najafabadi et al., 2015) using Tomtom (Bailey et al., 2009) and considered as significantly overlapping with a False Discovery Rate (FDR) below 0.1. To find MEME and predicted motifs in repetitive peaks, we used FIMO (Bailey et al., 2009). 
Differential H3K9me3 and KAP1 distribution in WT and Chr2-cl or Chr4-cl KO ES cells at TEs was determined by counting ChIP-seq reads overlapping annotated insertions of each TE group using BEDTools (MultiCovBed). Additionally, ChIP-seq reads were counted at the TE fraction that was bound by Chr2-cl or Chr4-cl KRAB-ZFPs (overlapping with 200 bp peaks). Count tables were concatenated and analyzed using DESeq2 (Love et al., 2014). The previously published ChIP-seq datasets for KAP1 (Castro-Diaz et al., 2014) and H3K9me3 (Dan et al., 2014) were re-mapped using Bowtie (--best).
### Luciferase reporter assays
For KRAB-ZFP repression assays, double-stranded DNA oligos containing KRAB-ZFP target sequences (Supplementary file 3) were cloned upstream of the SV40 promoter of the pGL3-Promoter vector (Promega) between the restriction sites for NheI and XhoI. 33 ng of reporter vectors were co-transfected (Lipofectamine 2000, Thermofisher) with 33 ng pRL-SV40 (Promega) for normalization and 33 ng of transient KRAB-ZFP expression vectors (in pcDNA3.1) or empty pcDNA3.1 into 293 T cells seeded one day earlier in 96-well plates. Cells were lysed 48 hr after transfection and luciferase/Renilla luciferase activity was measured using the Dual-Luciferase Reporter Assay System (Promega). To measure the transcriptional activity of the MMETn element upstream of the Cd59a gene, fragments of varying sizes (Supplementary file 3) were cloned into the promoter-less pGL3-basic vector (Promega) using NheI and NcoI sites. 70 ng of reporter vectors were cotransfected with 30 ng pRL-SV40 into feeder-depleted Chr4-cl WT and KO ES cells, seeded into a gelatinized 96-well plate 2 hr before transfection. Luciferase activity was measured 48 hr after transfection as described above.
### RNA-seq analysis
Whole RNA was purified using RNeasy columns (Qiagen) with on column DNase treatment or the High Pure RNA Isolation Kit (Roche) (Supplementary file 4). Tissues were first lysed in TRIzol reagent (ThermoFisher) and RNA was purified after the isopropanol precipitation step using RNeasy columns (Qiagen) with on column DNase treatment. Libraries were generated using the SureSelect Strand-Specific RNA Library Prep kit (Agilent) or Illumina's TruSeq RNA Library Prep Kit (with polyA selection) and sequenced as 50 or 100 bp paired-end reads on an Illumina HiSeq2500 (RRID:SCR\_016383) or HiSeq3000 (RRID:SCR\_016386) machine (Supplementary file 4). RNA-seq reads were mapped to the mouse genome (mm9) using Tophat (RRID:SCR\_013035; settings: -I 200000 -g 1) unless otherwise stated. These settings allow each mappable read to be reported once, in case the read maps to multiple locations equally well, one match is randomly chosen. For differential transposon expression, mapped reads that overlap with TEs annotated in Repeatmasker (RRID:SCR\_012954) were counted using BEDTools MultiCovBed (setting: -split). Reads mapping to multiple fragments that belong to the same TE insertion (as indicated by the repeat ID) were summed up. Only transposons with a total of at least 20 (for two biological replicates) or 30 (for three biological replicates) mapped reads across WT and KO samples were considered for differential expression analysis. Transposons within the deleted KRAB-ZFP cluster were excluded from the analysis. Read count tables were used for differential expression analysis with DESeq2 (RRID:SCR\_015687). For differential gene expression analysis, reads overlapping with gene exons were counted using HTSeq-count and analyzed using DESeq2. To test if KRAB-ZFP peaks are significantly enriched near up- or down-regulated genes, a binomial test was performed.
Briefly, the proportion of the peaks that are located within a certain distance up- or downstream to the TSS of genes was determined using the windowBed function of BEDTools. The probability p in the binomial distribution was estimated as the fraction of all genes overlapped with KRAB-ZFP peaks. Then, given n which is the number of specific groups of genes, and x which is the number of this group of genes overlapped with peaks, the R function binom.test() was used to estimate the p-value based on a right-tailed binomial test. Finally, the adjusted p-values were determined separately for LTR and LINE retrotransposon groups using the Benjamini-Hochberg approach implemented in the R function p.adjust().
### Reduced representation bisulfite sequencing (RRBS-seq)
For RRBS-seq analysis, Chr4-cl WT and KO ES cells were grown in either standard ES cell media containing FCS or for one week in 2i media containing vitamin C as described previously (Blaschke et al., 2013). Genomic DNA was purified from WT and Chr4-cl KO ES cells using the Quick-gDNA purification kit (Zymo Research) and bisulfite-converted with the NEXTflex Bisulfite-Seq Kit (Bio Scientific) using Msp1 digestion to fragment DNA. Libraries were sequenced as 50 bp paired-end reads on an Illumina HiSeq. The reads were processed using Trim Galore (--illumina --paired --rrbs) to trim poor quality bases and adaptors. Additionally, the first 5 nt of R2 and the last 3 nt of R1 and R2 were trimmed. Reads were then mapped to the reference genome (mm9) using Bismark (Krueger and Andrews, 2011) to extract methylation calling results. The CpG methylation pattern for each covered CpG dyad (two complementary CG dinucleotides) was calculated using a custom script (Source code 1: get\_CpG\_ML.pl). For comparison of CpG methylation between WT and Chr4-cl KO ES cells (in serum or 2i + Vitamin C conditions) only CpG sites with at least 10-fold coverage in each sample were considered for analysis.
### Retrotransposition assay
The retrotransposition vectors pCMV-MusD2, pCMV-MusD2-neoTNF and pCMV-ETnI1-neoTNF (Ribet et al., 2004) were a kind gift from Dixie Mager. To partially delete the Gm13051 binding site within pCMV-MusD2-neoTNF, the vector was cut with KpnI and re-ligated using a repair oligo, leaving a 24 bp deletion within the Gm13051 binding site. The Rex2 binding site in pCMV-ETnI1-neoTNF was deleted by cutting the vector with EcoRI and XbaI followed by re-ligation using two overlapping PCR products, leaving a 45 bp deletion while maintaining the rest of the vector unchanged (see Supplementary file 3 for primer sequences). For MusD retrotransposition assays, 5 × 10^4 HeLa cells (ATCC CCL-2) were transfected in a 24-well dish with 100 ng pCMV-MusD2-neoTNF or pCMV-MusD2-neoTNF (ΔGm13051-m) using Lipofectamine 2000. For ETn retrotransposition assays, 50 ng of pCMV-ETnI1-neoTNF or pCMV-ETnI1-neoTNF (ΔRex2) vectors were cotransfected with 50 ng pCMV-MusD2 to provide gag and pol proteins in trans. G418 (0.6 mg/ml) was added five days after transfection and cells were grown under selection until colonies were readily visible by eye. G418-resistant colonies were stained with Amido Black (Sigma).
### Capture-seq screen
To identify novel retrotransposon insertions, genomic DNA from various tissues (Supplementary file 4) was purified and used for library construction with target enrichment using the SureSelectQXT Target Enrichment kit (Agilent). Custom RNA capture probes were designed to hybridize with the 120 bp 5′ ends of the 5′ LTRs and the 120 bp 3′ ends of the 3′ LTR of about 600 intact (internal region flanked by two LTRs) MMETn/RLTRETN retrotransposons or of 140 RLTR4\_MM/RLTR4 retrotransposons that were upregulated in Chr4-cl KO ES cells (Figure 4—source data 2). Enriched libraries were sequenced on an Illumina HiSeq as paired-end 50 bp reads. R1 and R2 reads were mapped to the mm9 genome separately, using settings that only allow non-duplicated, uniquely mappable reads (Bowtie -m 1 --best --strata; samtools rmdup -s) and under settings that allow multimapping and duplicated reads (Bowtie --best). Of the latter, only reads that overlap (min. 50% of read) with RLTRETN, MMETn-int, ETnERV-int, ETnERV2-int or ETnERV3-int repeats (ETn) or RLTR4, RLTR4\_MM-int or MuLV-int repeats (RLTR4) were kept. Only uniquely mappable reads whose paired reads were overlapping with the repeats mentioned above were used for further analysis. All ETn- and RLTR4-paired reads were then clustered (as bed files) using BEDTools (bedtools merge -i -n -d 1000) to receive a list of all potential annotated and non-annotated new ETn or RLTR4 insertion sites and all overlapping ETn- or RLTR4-paired reads were counted for each sample at each locus. Finally, all regions that were located within 1 kb of an annotated RLTRETN, MMETn-int, ETnERV-int, ETnERV2-int or ETnERV3-int repeat as well as regions overlapping with previously identified polymorphic ETn elements (Nellåker et al., 2012) were removed. Genomic loci with at least 10 reads per million unique ETn- or RLTR4-paired reads were considered as insertion sites.
To qualify for a de-novo insertion, we allowed no called insertions in any of the other screened mice at the locus and not a single read at the locus in the ancestors of the mouse. Insertions at the same locus in at least two siblings from the same offspring were considered as germ line insertions, if the insertion was absent in the parents and mice who were not direct descendants from these siblings. Full-length sequencing of new ETn insertions was done by Sanger sequencing of short PCR products in combination with Illumina sequencing of a large PCR product (Supplementary file 3), followed by de-novo assembly using the Unicycler software.
## Tables
Table 1.: * Number of protein-coding KRAB-ZFP genes identified in a previously published screen (Imbeault et al., 2017) and the ChIP-seq data column indicates the number of KRAB-ZFPs for which ChIP-seq was performed in this study.
| Cluster | Location | Size (Mb) | # of KRAB-ZFPs* | ChIP-seq data |
|-----------|------------|-------------|-------------------|-----------------|
| Chr2 | Chr2 qH4 | 3.1 | 40 | 17 |
| Chr4 | Chr4 qE1 | 2.3 | 21 | 19 |
| Chr10 | Chr10 qC1 | 0.6 | 6 | 1 |
| Chr13.1 | Chr13 qB3 | 1.2 | 6 | 2 |
| Chr13.2 | Chr13 qB3 | 0.8 | 26 | 12 |
| Chr8 | Chr8 qB3.3 | 0.1 | 4 | 4 |
| Chr9 | Chr9 qA3 | 0.1 | 4 | 2 |
| Other | - | - | 248 | 4 |
Key resources table:
| Reagent type (species) or resource | Designation | Source or reference | Identifiers | Additional information |
|------------------------------------------|----------------------------------------|-----------------------------------|-------------------------------------|------------------------------------------------------|
| Strain, strain background (Mus musculus) | 129 × 1/SvJ | The Jackson Laboratory | 000691 | Mice used to generate mixed strain Chr4-cl KO mice |
| Cell line (Homo-sapiens) | HeLa | ATCC | ATCC CCL-2 | |
| Cell line (Mus musculus) | JM8A3.N1 C57BL/6N-Atm1Brd | KOMP Repository | PL236745 | B6 ES cells used to generate KO cell lines and mice |
| Cell line (Mus musculus) | B6;129 Gt(ROSA)26Sortm1(cre/ERT)Nat/J | The Jackson Laboratory | 004847 | ES cells used to generate KO cell lines and mice |
| Cell line (Mus musculus) | R1 ES cells | Andras Nagy lab | R1 | 129 ES cells used to generate KO cell lines and mice |
| Cell line (Mus musculus) | F9 Embryonic carcinoma cells | ATCC | ATCC CRL-1720 | |
| Antibody                                 | Mouse monoclonal ANTI-FLAG M2 antibody | Sigma-Aldrich                     | Cat# F1804, RRID:AB\_262044          | ChIP (1 µg/10^7 cells)                               |
| Antibody                                 | Rabbit polyclonal anti-HA              | Abcam                             | Cat# ab9110, RRID:AB\_307019         | ChIP (1 µg/10^7 cells)                               |
| Antibody                                 | Mouse monoclonal anti-HA               | Covance                           | Cat# MMS-101P-200, RRID:AB\_10064068 |                                                      |
| Antibody                                 | Rabbit polyclonal anti-H3K9me3         | Active Motif                      | Cat# 39161, RRID:AB\_2532132         | ChIP (3 µl/10^7 cells)                               |
| Antibody                                 | Rabbit polyclonal anti-GFP             | Thermo Fisher Scientific          | Cat# A-11122, RRID:AB\_221569        | ChIP (1 µg/10^7 cells)                               |
| Antibody                                 | Rabbit polyclonal anti-H3K4me3         | Abcam                             | Cat# ab8580, RRID:AB\_306649         | ChIP (1 µg/10^7 cells)                               |
| Antibody                                 | Rabbit polyclonal anti-H3K4me1         | Abcam                             | Cat# ab8895, RRID:AB\_306847         | ChIP (1 µg/10^7 cells)                               |
| Antibody                                 | Rabbit polyclonal anti-H3K27ac         | Abcam                             | Cat# ab4729, RRID:AB\_2118291        | ChIP (1 µg/10^7 cells)                               |
| Recombinant DNA reagent | pCW57.1 | Addgene | RRID:Addgene\_41393 | Inducible lentiviral expression vector |
| Recombinant DNA reagent | pX330-U6-Chimeric\_BB-CBh-hSpCas9 | Addgene | RRID:Addgene\_42230 | CRISPR/Cas9 expression construct |
| Sequence-based reagent | Chr2-cl KO gRNA.1 | This paper | Cas9 gRNA | GCCGTTGCTCAGTCCAAATG |
| Sequence-based reagent                   | Chr2-cl KO gRNA.2                      | This paper                        | Cas9 gRNA                           | GATACCAGAGGTGGCCGCAAG                                |
| Sequence-based reagent                   | Chr4-cl KO gRNA.1                      | This paper                        | Cas9 gRNA                           | GCAAAGGGGCTCCTCGATGGA                                |
| Sequence-based reagent                   | Chr4-cl KO gRNA.2                      | This paper                        | Cas9 gRNA                           | GTTTATGGCCGTGCTAAGGTC                                |
| Sequence-based reagent                   | Chr10-cl KO gRNA.1                     | This paper                        | Cas9 gRNA                           | GTTGCCTTCATCCCACCGTG                                 |
| Sequence-based reagent                   | Chr10-cl KO gRNA.2                     | This paper                        | Cas9 gRNA                           | GAAGTTCGACTTGGACGGGCT                                |
| Sequence-based reagent                   | Chr13.1-cl KO gRNA.1                   | This paper                        | Cas9 gRNA                           | GTAACCCATCATGGGCCCTAC                                |
| Sequence-based reagent                   | Chr13.1-cl KO gRNA.2                   | This paper                        | Cas9 gRNA                           | GGACAGGTTATAGGTTTGAT                                 |
| Sequence-based reagent                   | Chr13.2-cl KO gRNA.1                   | This paper                        | Cas9 gRNA                           | GGGTTTCTGAGAAACGTGTA                                 |
| Sequence-based reagent                   | Chr13.2-cl KO gRNA.2                   | This paper                        | Cas9 gRNA                           | GTGTAATGAGTTCTTATATC                                 |
| Commercial assay or kit | SureSelectQXT Target Enrichment kit | Agilent | G9681-90000 | |
| Software, algorithm | Bowtie | http://bowtie-bio.sourceforge.net | RRID:SCR\_005476 | |
| Software, algorithm | MACS14 | https://bio.tools/macs | RRID:SCR\_013291 | |
| Software, algorithm | Tophat | https://ccb.jhu.edu | RRID:SCR\_013035 | |
## Figures
Figure 1.: Genome-wide binding patterns of mouse KRAB-ZFPs.
(A) Probability heatmap of KRAB-ZFP binding to TEs. Blue color intensity (main field) corresponds to -log10 (adjusted p-value) enrichment of ChIP-seq peak overlap with TE groups (Fisher's exact test). The green/red color intensity (top panel) represents mean KAP1 (GEO accession: GSM1406445) and H3K9me3 (GEO accession: GSM1327148) enrichment (respectively) at peaks overlapping significantly targeted TEs (adjusted p-value<1e-5) in WT ES cells. (B) Summarized ChIP-seq signal for indicated KRAB-ZFPs and previously published KAP1 and H3K9me3 in WT ES cells across 127 intact ETn elements. (C) Heatmaps of KRAB-ZFP ChIP-seq signal at ChIP-seq peaks. For better comparison, peaks for all three KRAB-ZFPs were called with the same parameters (p<1e-10, peak enrichment >20). The top panel shows a schematic of the arrangement of the contact amino acid composition of each zinc finger. Zinc fingers are grouped and colored according to similarity, with amino acid differences relative to the five consensus fingers highlighted in white.
Figure 1—source data 1. KRAB-ZFP expression in 40 mouse tissues and cell lines (ENCODE). Mean values of replicates are shown as log2 transcripts per million.
Figure 1—source data 2. Probability heatmap of KRAB-ZFP binding to TEs. Values correspond to -log10 (adjusted p-value) enrichment of ChIP-seq peak overlap with TE groups (Fisher's exact test).
<!-- image -->
Figure 1—figure supplement 1.: ES cell-specific expression of KRAB-ZFP gene clusters.
(A) Heatmap showing expression patterns of mouse KRAB-ZFPs in 40 mouse tissues and cell lines (ENCODE). Heatmap colors indicate gene expression levels in log2 transcripts per million (TPM). The asterisk indicates a group of 30 KRAB-ZFPs that are exclusively expressed in ES cells. (B) Physical location of the genes encoding for the 30 KRAB-ZFPs that are exclusively expressed in ES cells. (C) Phylogenetic (Maximum likelihood) tree of the KRAB domains of mouse KRAB-ZFPs. KRAB-ZFPs encoded on the gene clusters on chromosome 2 and 4 are highlighted. The scale bar at the bottom indicates amino acid substitutions per site.
<!-- image -->
Figure 1—figure supplement 2.: KRAB-ZFP binding motifs and their repression activity.
(A) Comparison of computationally predicted (bottom) and experimentally determined (top) KRAB-ZFP binding motifs. Only significant pairs are shown (FDR < 0.1). (B) Luciferase reporter assays to confirm KRAB-ZFP repression of the identified target sites. Bars show the luciferase activity (normalized to Renilla luciferase) of reporter plasmids containing the indicated target sites cloned upstream of the SV40 promoter. Reporter plasmids were co-transfected into 293 T cells with a Renilla luciferase plasmid for normalization and plasmids expressing the targeting KRAB-ZFP. Normalized mean luciferase activity (from three replicates) is shown relative to luciferase activity of the reporter plasmid co-transfected with an empty pcDNA3.1 vector.
<!-- image -->
Figure 1—figure supplement 3.: KRAB-ZFP binding to ETn retrotransposons.
(A) Comparison of the PBSLys1,2 sequence with Zfp961 binding motifs in nonrepetitive peaks (Nonrep) and peaks at ETn elements. (B) Retrotransposition assays of original (ETnI1-neoTNF and MusD2-neoTNF Ribet et al., 2004) and modified reporter vectors where the Rex2 or Gm13051 binding motifs were removed. Schematics of reporter vectors are displayed at the top. HeLa cells were transfected as described in the Materials and Methods section and neo-resistant colonies, indicating retrotransposition events, were selected and stained. (C) Stem-loop structure of the ETn RNA export signal, the Gm13051 motif on the corresponding DNA is marked with red circles, the part of the motif that was deleted is indicated with grey crosses (adapted from Legiewicz et al., 2010).
<!-- image -->
Figure 2.: Retrotransposon reactivation in KRAB-ZFP cluster KO ES cells.
(A) RNA-seq analysis of TE expression in five KRAB-ZFP cluster KO ES cells. Green and grey squares on top of the panel represent KRAB-ZFPs with or without ChIP-seq data, respectively, within each deleted gene cluster. Reactivated TEs that are bound by one or several KRAB-ZFPs are indicated by green squares in the panel. Significantly up- and downregulated elements (adjusted p-value<0.05) are highlighted in red and green, respectively. (B) Differential KAP1 binding and H3K9me3 enrichment at TE groups (summarized across all insertions) in Chr2-cl and Chr4-cl KO ES cells. TE groups targeted by one or several KRAB-ZFPs encoded within the deleted clusters are highlighted in blue (differential enrichment over the entire TE sequences) and red (differential enrichment at TE regions that overlap with KRAB-ZFP ChIP-seq peaks). (C) DNA methylation status of CpG sites at indicated TE groups in WT and Chr4-cl KO ES cells grown in serum containing media or in hypomethylation-inducing media (2i + Vitamin C). P-values were calculated using paired t-test.
Figure 2—source data 1. Differential H3K9me3 and KAP1 distribution in WT and KRAB-ZFP cluster KO ES cells at TE families and KRAB-ZFP bound TE insertions. Differential read counts and statistical testing were determined by DESeq2.
<!-- image -->
Figure 2—figure supplement 1.: Epigenetic changes at TEs and TE-borne enhancers in KRAB-ZFP cluster KO ES cells.
(A) Differential analysis of summative (all individual insertions combined) H3K9me3 enrichment at TE groups in Chr10-cl, Chr13.1-cl and Chr13.2-cl KO ES cells. TE groups targeted by one or several KRAB-ZFPs encoded within the deleted clusters are highlighted in orange (differential enrichment over the entire TE sequences) and red (differential enrichment at TE regions that overlap with KRAB-ZFP ChIP-seq peaks). (B) Top: Schematic view of the Cd59a/Cd59b locus with a 5′ truncated ETn insertion. ChIP-seq (Input subtracted from ChIP) data for overexpressed epitope-tagged Gm13051 (a Chr4-cl KRAB-ZFP) in F9 EC cells, and re-mapped KAP1 (GEO accession: GSM1406445) and H3K9me3 (GEO accession: GSM1327148) in WT ES cells are shown together with RNA-seq data from Chr4-cl WT and KO ES cells (mapped using Bowtie (-a -m 1 --strata -v 2) to exclude reads that cannot be uniquely mapped). Bottom: Transcriptional activity of a 5 kb fragment with or without fragments of the ETn insertion was tested by luciferase reporter assay in Chr4-cl WT and KO ES cells.
<!-- image -->
Figure 3.: TE-dependent gene activation in KRAB-ZFP cluster KO ES cells.
(A) Differential gene expression in Chr2-cl and Chr4-cl KO ES cells. Significantly up- and downregulated genes (adjusted p-value<0.05) are highlighted in red and green, respectively, KRAB-ZFP genes within the deleted clusters are shown in blue. (B) Correlation of TEs and gene deregulation. Plots show enrichment of TE groups within 100 kb of up- and downregulated genes relative to all genes. Significantly overrepresented LTR and LINE groups (adjusted p-value<0.1) are highlighted in blue and red, respectively. (C) Schematic view of the downstream region of Chst1 where a 5′ truncated ETn insertion is located. ChIP-seq (Input subtracted from ChIP) data for overexpressed epitope-tagged Gm13051 (a Chr4-cl KRAB-ZFP) in F9 EC cells, and re-mapped KAP1 (GEO accession: GSM1406445) and H3K9me3 (GEO accession: GSM1327148) in WT ES cells are shown together with RNA-seq data from Chr4-cl WT and KO ES cells (mapped using Bowtie (-a -m 1 --strata -v 2) to exclude reads that cannot be uniquely mapped). (D) RT-qPCR analysis of Chst1 mRNA expression in Chr4-cl WT and KO ES cells with or without the CRISPR/Cas9 deleted ETn insertion near Chst1. Values represent mean expression (normalized to Gapdh) from three biological replicates per sample (each performed in three technical replicates) in arbitrary units. Error bars represent standard deviation and asterisks indicate significance (p<0.01, Student's t-test). n.s.: not significant. (E) Mean coverage of ChIP-seq data (Input subtracted from ChIP) in Chr4-cl WT and KO ES cells over 127 full-length ETn insertions. The binding sites of the Chr4-cl KRAB-ZFPs Rex2 and Gm13051 are indicated by dashed lines.
<!-- image -->
Figure 4.: ETn retrotransposition in Chr4-cl KO mice.
(A) Pedigree of mice used for transposon insertion screening by capture-seq in mice of different strain backgrounds. The number of novel ETn insertions (only present in one animal) is indicated. For animals whose direct ancestors have not been screened, the ETn insertions are shown in parentheses since parental inheritance cannot be excluded in that case. Germ line insertions are indicated by asterisks. All DNA samples were prepared from tail tissues unless noted (-S: spleen, -E: ear, -B: blood). (B) Statistical analysis of ETn insertion frequency in tail tissue from 30 Chr4-cl KO, KO/WT and WT mice that were derived from one Chr4-cl KO x KO/WT and two Chr4-cl KO/WT x KO/WT matings. Only DNA samples that were collected from juvenile tails were considered for this analysis. P-values were calculated using one-sided Wilcoxon Rank Sum Test. In the last panel, KO, WT and KO/WT mice derived from all matings were combined for the statistical analysis.
Figure 4—source data 1. Coordinates of identified novel ETn insertions and supporting capture-seq read counts. Genomic regions indicate clusters of supporting reads.
Figure 4—source data 2. Sequences of capture-seq probes used to enrich genomic DNA for ETn and MuLV (RLTR4) insertions.
<!-- image -->
Figure 4—figure supplement 1.: Birth statistics of KRAB-ZFP cluster KO mice and TE reactivation in adult tissues.
(A) Birth statistics of Chr4- and Chr2-cl mice derived from KO/WT x KO/WT matings in different strain backgrounds. (B) RNA-seq analysis of TE expression in Chr2- (left) and Chr4-cl (right) KO tissues. TE groups with the highest reactivation phenotype in ES cells are shown separately. Significantly up- and downregulated elements (adjusted p-value<0.05) are highlighted in red and green, respectively. Experiments were performed in at least two biological replicates.
<!-- image -->
Figure 4—figure supplement 2.: Identification of polymorphic ETn and MuLV retrotransposon insertions in Chr4-cl KO and WT mice.
Heatmaps show normalized capture-seq read counts in RPM (Read Per Million) for identified polymorphic ETn (A) and MuLV (B) loci in different mouse strains. Only loci with strong support for germ line ETn or MuLV insertions (at least 100 or 3000 ETn or MuLV RPM, respectively) in at least two animals are shown. Non-polymorphic insertion loci with high read counts in all screened mice were excluded for better visibility. The sample information (sample name and cell type/tissue) is annotated at the bottom, with the strain information indicated by color at the top. The color gradient indicates log10(RPM+1).
<!-- image -->
Figure 4—figure supplement 3.: Confirmation of novel ETn insertions identified by capture-seq.
(A) PCR validation of novel ETn insertions in genomic DNA of three littermates (IDs: T09673, T09674 and T00436) and their parents (T3913 and T3921). Primer sequences are shown in Supplementary file 3. (B) ETn capture-seq read counts (RPM) at putative novel somatic (loci identified exclusively in one single animal), novel germ line (loci identified in several littermates) insertions, and at B6 reference ETn elements. (C) Heatmap shows capture-seq read counts (RPM) of a Chr4-cl KO mouse (ID: C6733) as determined in different tissues. Each row represents a novel ETn locus that was identified in at least one tissue. The color gradient indicates log10(RPM+1). (D) Heatmap shows the capture-seq RPM in technical replicates using the same Chr4-cl KO DNA sample (rep1/rep2) or replicates with DNA samples prepared from different sections of the tail from the same mouse at different ages (tail1/tail2). Each row represents a novel ETn locus that was identified in at least one of the displayed samples. The color gradient indicates log10(RPM+1).
<!-- image -->
## References
- TL Bailey; M Boden; FA Buske; M Frith; CE Grant; L Clementi; J Ren; WW Li; WS Noble. MEME SUITE: tools for motif discovery and searching. Nucleic Acids Research (2009)
- C Baust; L Gagnier; GJ Baillie; MJ Harris; DM Juriloff; DL Mager. Structure and expression of mobile ETnII retroelements and their coding-competent MusD relatives in the mouse. Journal of Virology (2003)
- K Blaschke; KT Ebata; MM Karimi; JA Zepeda-Martínez; P Goyal; S Mahapatra; A Tam; DJ Laird; M Hirst; A Rao; MC Lorincz; M Ramalho-Santos. Vitamin C induces Tet-dependent DNA demethylation and a blastocyst-like state in ES cells. Nature (2013)
- A Brodziak; E Ziółko; M Muc-Wierzgoń; E Nowakowska-Zajdel; T Kokot; K Klakla. The role of human endogenous retroviruses in the pathogenesis of autoimmune diseases. Medical Science Monitor : International Medical Journal of Experimental and Clinical Research (2012)
- N Castro-Diaz; G Ecco; A Coluccio; A Kapopoulou; B Yazdanpanah; M Friedli; J Duc; SM Jang; P Turelli; D Trono. Evolutionally dynamic L1 regulation in embryonic stem cells. Genes & Development (2014)
- EB Chuong; NC Elde; C Feschotte. Regulatory evolution of innate immunity through co-option of endogenous retroviruses. Science (2016)
- J Dan; Y Liu; N Liu; M Chiourea; M Okuka; T Wu; X Ye; C Mou; L Wang; L Wang; Y Yin; J Yuan; B Zuo; F Wang; Z Li; X Pan; Z Yin; L Chen; DL Keefe; S Gagos; A Xiao; L Liu. Rif1 maintains telomere length homeostasis of ESCs by mediating heterochromatin silencing. Developmental Cell (2014)
- A De Iaco; E Planet; A Coluccio; S Verp; J Duc; D Trono. DUX-family transcription factors regulate zygotic genome activation in placental mammals. Nature Genetics (2017)
- Ö Deniz; L de la Rica; KCL Cheng; D Spensberger; MR Branco. SETDB1 prevents TET2-dependent activation of IAP retroelements in naïve embryonic stem cells. Genome Biology (2018)
- M Dewannieux; T Heidmann. Endogenous retroviruses: acquisition, amplification and taming of genome invaders. Current Opinion in Virology (2013)
- G Ecco; M Cassano; A Kauzlaric; J Duc; A Coluccio; S Offner; M Imbeault; HM Rowe; P Turelli; D Trono. Transposable elements and their KRAB-ZFP controllers regulate gene expression in adult tissues. Developmental Cell (2016)
- G Ecco; M Imbeault; D Trono. KRAB zinc finger proteins. Development (2017)
- JA Frank; C Feschotte. Co-option of endogenous viral sequences for host cell function. Current Opinion in Virology (2017)
- L Gagnier; VP Belancio; DL Mager. Mouse germ line mutations due to retrotransposon insertions. Mobile DNA (2019)
- AC Groner; S Meylan; A Ciuffi; N Zangger; G Ambrosini; N Dénervaud; P Bucher; D Trono. KRAB-zinc finger proteins and KAP1 can mediate long-range transcriptional repression through heterochromatin spreading. PLOS Genetics (2010)
- DC Hancks; HH Kazazian. Roles for retrotransposon insertions in human disease. Mobile DNA (2016)
- M Imbeault; PY Helleboid; D Trono. KRAB zinc-finger proteins contribute to the evolution of gene regulatory networks. Nature (2017)
- FM Jacobs; D Greenberg; N Nguyen; M Haeussler; AD Ewing; S Katzman; B Paten; SR Salama; D Haussler. An evolutionary arms race between KRAB zinc-finger genes ZNF91/93 and SVA/L1 retrotransposons. Nature (2014)
- H Kano; H Kurahashi; T Toda. Genetically regulated epigenetic transcriptional activation of retrotransposon insertion confers mouse dactylaplasia phenotype. PNAS (2007)
- MM Karimi; P Goyal; IA Maksakova; M Bilenky; D Leung; JX Tang; Y Shinkai; DL Mager; S Jones; M Hirst; MC Lorincz. DNA methylation and SETDB1/H3K9me3 regulate predominantly distinct sets of genes, retroelements, and chimeric transcripts in mESCs. Cell Stem Cell (2011)
- A Kauzlaric; G Ecco; M Cassano; J Duc; M Imbeault; D Trono. The mouse genome displays highly dynamic populations of KRAB-zinc finger protein genes and related genetic units. PLOS ONE (2017)
- PP Khil; F Smagulova; KM Brick; RD Camerini-Otero; GV Petukhova. Sensitive mapping of recombination hotspots using sequencing-based detection of ssDNA. Genome Research (2012)
- F Krueger; SR Andrews. Bismark: a flexible aligner and methylation caller for Bisulfite-Seq applications. Bioinformatics (2011)
- B Langmead; SL Salzberg. Fast gapped-read alignment with bowtie 2. Nature Methods (2012)
- M Legiewicz; AS Zolotukhin; GR Pilkington; KJ Purzycka; M Mitchell; H Uranishi; J Bear; GN Pavlakis; SF Le Grice; BK Felber. The RNA transport element of the murine musD retrotransposon requires long-range intramolecular interactions for function. Journal of Biological Chemistry (2010)
- JA Lehoczky; PE Thomas; KM Patrie; KM Owens; LM Villarreal; K Galbraith; J Washburn; CN Johnson; B Gavino; AD Borowsky; KJ Millen; P Wakenight; W Law; ML Van Keuren; G Gavrilina; ED Hughes; TL Saunders; L Brihn; JH Nadeau; JW Innis. A novel intergenic ETnII-β insertion mutation causes multiple malformations in Polypodia mice. PLOS Genetics (2013)
- D Leung; T Du; U Wagner; W Xie; AY Lee; P Goyal; Y Li; KE Szulwach; P Jin; MC Lorincz; B Ren. Regulation of DNA methylation turnover at LTR retrotransposons and imprinted loci by the histone methyltransferase Setdb1. PNAS (2014)
- J Lilue; AG Doran; IT Fiddes; M Abrudan; J Armstrong; R Bennett; W Chow; J Collins; S Collins; A Czechanski; P Danecek; M Diekhans; DD Dolle; M Dunn; R Durbin; D Earl; A Ferguson-Smith; P Flicek; J Flint; A Frankish; B Fu; M Gerstein; J Gilbert; L Goodstadt; J Harrow; K Howe; X Ibarra-Soria; M Kolmogorov; CJ Lelliott; DW Logan; J Loveland; CE Mathews; R Mott; P Muir; S Nachtweide; FCP Navarro; DT Odom; N Park; S Pelan; SK Pham; M Quail; L Reinholdt; L Romoth; L Shirley; C Sisu; M Sjoberg-Herrera; M Stanke; C Steward; M Thomas; G Threadgold; D Thybert; J Torrance; K Wong; J Wood; B Yalcin; F Yang; DJ Adams; B Paten; TM Keane. Sixteen diverse laboratory mouse reference genomes define strain-specific haplotypes and novel functional loci. Nature Genetics (2018)
- S Liu; J Brind'Amour; MM Karimi; K Shirane; A Bogutz; L Lefebvre; H Sasaki; Y Shinkai; MC Lorincz. Setdb1 is required for germline development and silencing of H3K9me3-marked endogenous retroviruses in primordial germ cells. Genes & Development (2014)
- MI Love; W Huber; S Anders. Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology (2014)
- F Lugani; R Arora; N Papeta; A Patel; Z Zheng; R Sterken; RA Singer; G Caridi; C Mendelsohn; L Sussel; VE Papaioannou; AG Gharavi. A retrotransposon insertion in the 5' regulatory domain of Ptf1a results in ectopic gene expression and multiple congenital defects in Danforth's short tail mouse. PLOS Genetics (2013)
- TS Macfarlan; WD Gifford; S Driscoll; K Lettieri; HM Rowe; D Bonanomi; A Firth; O Singer; D Trono; SL Pfaff. Embryonic stem cell potency fluctuates with endogenous retrovirus activity. Nature (2012)
- IA Maksakova; MT Romanish; L Gagnier; CA Dunn; LN van de Lagemaat; DL Mager. Retroviral elements and their hosts: insertional mutagenesis in the mouse germ line. PLOS Genetics (2006)
- T Matsui; D Leung; H Miyashita; IA Maksakova; H Miyachi; H Kimura; M Tachibana; MC Lorincz; Y Shinkai. Proviral silencing in embryonic stem cells requires the histone methyltransferase ESET. Nature (2010)
- HS Najafabadi; S Mnaimneh; FW Schmitges; M Garton; KN Lam; A Yang; M Albu; MT Weirauch; E Radovani; PM Kim; J Greenblatt; BJ Frey; TR Hughes. C2H2 zinc finger proteins greatly expand the human regulatory lexicon. Nature Biotechnology (2015)
- C Nellåker; TM Keane; B Yalcin; K Wong; A Agam; TG Belgard; J Flint; DJ Adams; WN Frankel; CP Ponting. The genomic landscape shaped by selection on transposable elements across 18 mouse strains. Genome Biology (2012)
- H O'Geen; S Frietze; PJ Farnham. Using ChIP-seq technology to identify targets of zinc finger transcription factors. Methods in Molecular Biology (2010)
- A Patel; P Yang; M Tinkham; M Pradhan; M-A Sun; Y Wang; D Hoang; G Wolf; JR Horton; X Zhang; T Macfarlan; X Cheng. DNA conformation induces adaptable binding by tandem zinc finger proteins. Cell (2018)
- D Ribet; M Dewannieux; T Heidmann. An active murine transposon family pair: retrotransposition of "master" MusD copies and ETn trans-mobilization. Genome Research (2004)
- SR Richardson; P Gerdes; DJ Gerhardt; FJ Sanchez-Luque; GO Bodea; M Muñoz-Lopez; JS Jesuadian; MHC Kempen; PE Carreira; JA Jeddeloh; JL Garcia-Perez; HH Kazazian; AD Ewing; GJ Faulkner. Heritable L1 retrotransposition in the mouse primordial germline and early embryo. Genome Research (2017)
- HM Rowe; J Jakobsson; D Mesnard; J Rougemont; S Reynard; T Aktas; PV Maillard; H Layard-Liesching; S Verp; J Marquis; F Spitz; DB Constam; D Trono. KAP1 controls endogenous retroviruses in embryonic stem cells. Nature (2010)
- HM Rowe; A Kapopoulou; A Corsinotti; L Fasching; TS Macfarlan; Y Tarabay; S Viville; J Jakobsson; SL Pfaff; D Trono. TRIM28 repression of retrotransposon-based enhancers is necessary to preserve transcriptional dynamics in embryonic stem cells. Genome Research (2013)
- SN Schauer; PE Carreira; R Shukla; DJ Gerhardt; P Gerdes; FJ Sanchez-Luque; P Nicoli; M Kindlova; S Ghisletti; AD Santos; D Rapoud; D Samuel; J Faivre; AD Ewing; SR Richardson; GJ Faulkner. L1 retrotransposition is a common feature of mammalian hepatocarcinogenesis. Genome Research (2018)
- DC Schultz; K Ayyanathan; D Negorev; GG Maul; FJ Rauscher. SETDB1: a novel KAP-1-associated histone H3, lysine 9-specific methyltransferase that contributes to HP1-mediated silencing of euchromatic genes by KRAB zinc-finger proteins. Genes & Development (2002)
- K Semba; K Araki; K Matsumoto; H Suda; T Ando; A Sei; H Mizuta; K Takagi; M Nakahara; M Muta; G Yamada; N Nakagata; A Iida; S Ikegawa; Y Nakamura; M Araki; K Abe; K Yamamura. Ectopic expression of Ptf1a induces spinal defects, urogenital defects, and anorectal malformations in Danforth's short tail mice. PLOS Genetics (2013)
- SP Sripathy; J Stevens; DC Schultz. The KAP1 corepressor functions to coordinate the assembly of de novo HP1-demarcated microenvironments of heterochromatin required for KRAB zinc finger protein-mediated transcriptional repression. Molecular and Cellular Biology (2006)
- JH Thomas; S Schneider. Coevolution of retroelements and tandem zinc finger genes. Genome Research (2011)
- PJ Thompson; TS Macfarlan; MC Lorincz. Long terminal repeats: from parasitic elements to building blocks of the transcriptional regulatory repertoire. Molecular Cell (2016)
- RS Treger; SD Pope; Y Kong; M Tokuyama; M Taura; A Iwasaki. The lupus susceptibility locus Sgp3 encodes the suppressor of endogenous retrovirus expression SNERV. Immunity (2019)
- CN Vlangos; AN Siuniak; D Robinson; AM Chinnaiyan; RH Lyons; JD Cavalcoli; CE Keegan. Next-generation sequencing identifies the Danforth's short tail mouse mutation as a retrotransposon insertion affecting Ptf1a expression. PLOS Genetics (2013)
- J Wang; G Xie; M Singh; AT Ghanbarian; T Raskó; A Szvetnik; H Cai; D Besser; A Prigione; NV Fuchs; GG Schumann; W Chen; MC Lorincz; Z Ivics; LD Hurst; Z Izsvák. Primate-specific endogenous retrovirus-driven transcription defines naive-like stem cells. Nature (2014)
- D Wolf; K Hug; SP Goff. TRIM28 mediates primer binding site-targeted silencing of Lys1,2 tRNA-utilizing retroviruses in embryonic cells. PNAS (2008)
- G Wolf; D Greenberg; TS Macfarlan. Spotting the enemy within: targeted silencing of foreign DNA in mammalian genomes by the Krüppel-associated box zinc finger protein family. Mobile DNA (2015a)
- G Wolf; P Yang; AC Füchtbauer; EM Füchtbauer; AM Silva; C Park; W Wu; AL Nielsen; FS Pedersen; TS Macfarlan. The KRAB zinc finger protein ZFP809 is required to initiate epigenetic silencing of endogenous retroviruses. Genes & Development (2015b)
- M Yamauchi; B Freitag; C Khan; B Berwin; E Barklis. Stem cell factor binding to retrovirus primer binding site silencers. Journal of Virology (1995)
- Y Zhang; T Liu; CA Meyer; J Eeckhoute; DS Johnson; BE Bernstein; C Nusbaum; RM Myers; M Brown; W Li; XS Liu. Model-based analysis of ChIP-Seq (MACS). Genome Biology (2008)

View File

@ -0,0 +1,132 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Risk factors associated with fai ... s: Results of a multi-country analysis
item-2 at level 2: paragraph: Burgert-Brucker Clara R.; 1: Glo ... shington, DC, United States of America
item-3 at level 2: section_header: Abstract
item-4 at level 3: text: Achieving elimination of lymphat ... ine prevalence and/or lower elevation.
item-5 at level 2: section_header: Introduction
item-6 at level 3: text: Lymphatic filariasis (LF), a dis ... 8 countries remain endemic for LF [3].
item-7 at level 3: text: The road to elimination as a pub ... t elimination be officially validated.
item-8 at level 3: text: Pre-TAS include at least one sen ... me of day that blood can be taken [5].
item-9 at level 3: text: When a country fails to meet the ... o ensure rounds of MDA are not missed.
item-10 at level 3: text: This study aims to understand wh ... e of limited LF elimination resources.
item-11 at level 2: section_header: Methods
item-12 at level 3: text: This is a secondary data analysi ... rch; no ethical approval was required.
item-13 at level 3: text: Building on previous work, we de ... available global geospatial data sets.
item-14 at level 3: section_header: Data sources
item-15 at level 4: text: Information on baseline prevalen ... publicly available sources (Table 1).
item-16 at level 3: section_header: Outcome and covariate variables
item-17 at level 4: text: The outcome of interest for this ... r than or equal to 1% Mf or 2% Ag [4].
item-18 at level 4: text: Potential covariates were derive ... is and the final categorizations used.
item-19 at level 4: section_header: Baseline prevalence
item-20 at level 5: text: Baseline prevalence can be assum ... (2) using the cut-off of <10% or ≥10%.
item-21 at level 4: section_header: Agent
item-22 at level 5: text: In terms of differences in trans ... dazole (DEC-ALB)] from the MDA domain.
item-23 at level 4: section_header: Environment
item-24 at level 5: text: LF transmission intensity is inf ... dicates a higher level of “greenness.”
item-25 at level 5: text: We included the socio-economic v ... proxy for socio-economic status [33].
item-26 at level 5: text: Finally, all or parts of distric ... s were co-endemic with onchocerciasis.
item-27 at level 4: section_header: MDA
item-28 at level 5: text: Treatment effectiveness depends ... esent a threat to elimination [41,42].
item-29 at level 5: text: We considered three approaches w ... unds ever documented in that district.
item-30 at level 4: section_header: Pre-TAS implementation
item-31 at level 5: text: Pre-TAS results can be influence ... d throughout the time period of study.
item-32 at level 3: section_header: Data inclusion criteria
item-33 at level 4: text: The dataset, summarized at the d ... al analysis dataset had 554 districts.
item-34 at level 3: section_header: Statistical analysis and modeling
item-35 at level 4: text: Statistical analysis and modelin ... d the number of variables accordingly.
item-36 at level 4: text: Sensitivity analysis was perform ... ot have been truly LF-endemic [43,44].
item-37 at level 2: section_header: Results
item-38 at level 3: text: The overall pre-TAS pass rate fo ... ts had baseline prevalences below 20%.
item-39 at level 3: text: Fig 3 shows the unadjusted analy ... overage, and sufficient rounds of MDA.
item-40 at level 3: text: The final log-binomial model inc ... igh baseline and diagnostic test used.
item-41 at level 3: text: Fig 4 shows the risk ratio resul ... of failing pre-TAS (95% CI 1.95–4.83).
item-42 at level 3: text: Sensitivity analyses were conduc ... gnified by large confidence intervals.
item-43 at level 3: text: Overall 74 districts in the data ... or 51% of all the failures (38 of 74).
item-44 at level 2: section_header: Discussion
item-45 at level 3: text: This paper reports for the first ... ctors associated with TAS failure [7].
item-46 at level 3: text: Though diagnostic test used was ... FTS was more sensitive than ICT [45].
item-47 at level 3: text: Elevation was the only environme ... ich impact vector chances of survival.
item-48 at level 3: text: The small number of failures ove ... search has shown the opposite [15,16].
item-49 at level 3: text: All other variables included in ... are not necessary to lower prevalence.
item-50 at level 3: text: Limitations to this study includ ... reducing LF prevalence [41,48,51–53].
item-51 at level 3: text: Fourteen districts were excluded ... ta to extreme outliers in a district.
item-52 at level 3: text: As this analysis used data acros ... of individuals included in the survey.
item-53 at level 3: text: This paper provides evidence fro ... th high baseline and/or low elevation.
item-54 at level 2: section_header: Tables
item-55 at level 3: table with [18x8]
item-55 at level 4: caption: Table 1: Categorization of potential factors influencing pre-TAS results.
item-56 at level 3: table with [11x6]
item-56 at level 4: caption: Table 2: Adjusted risk ratios for pre-TAS failure from log-binomial model sensitivity analysis.
item-57 at level 2: section_header: Figures
item-58 at level 3: picture
item-58 at level 4: caption: Fig 1: Number of pre-TAS by country.
item-59 at level 3: picture
item-59 at level 4: caption: Fig 2: District-level baseline prevalence by country.
item-60 at level 3: picture
item-60 at level 4: caption: Fig 3: Percent pre-TAS failure by each characteristic (unadjusted).
item-61 at level 3: picture
item-61 at level 4: caption: Fig 4: Adjusted risk ratios for pre-TAS failure with 95% Confidence Interval from log-binomial model.
item-62 at level 3: picture
item-62 at level 4: caption: Fig 5: Analysis of failures by model combinations.
item-63 at level 2: section_header: References
item-64 at level 3: list: group list
item-65 at level 4: list_item: World Health Organization. Lymph ... rategic plan 2010–2020. Geneva; 2010.
item-66 at level 4: list_item: World Health Organization. Valid ... public health problem. Geneva; 2017.
item-67 at level 4: list_item: Global programme to eliminate ly ... eport, 2018. Wkly Epidemiol Rec (2019)
item-68 at level 4: list_item: World Health Organization. Globa ... ss drug administration. Geneva; 2011.
item-69 at level 4: list_item: World Health Organization. Stren ... isease-specific Indicators. 2016; 42.
item-70 at level 4: list_item: Kyelem D; Biswas G; Bockarie MJ; ... search needs. Am J Trop Med Hyg (2008)
item-71 at level 4: list_item: Goldberg EM; King JD; Mupfasoni ... c filariasis. Am J Trop Med Hyg (2019)
item-72 at level 4: list_item: Cano J; Rebollo MP; Golding N; P ... present. Parasites and Vectors (2014)
item-73 at level 4: list_item: CGIAR-CSI. CGIAR-CSI SRTM 90m DEM Digital Elevation Database. In: .
item-74 at level 4: list_item: USGS NASA. Vegetation indices 16 ... et]. [cited 1 May 2018]. Available: .
item-75 at level 4: list_item: Funk C; Peterson P; Landsfeld M; ... r monitoring extremes. Sci Data (2015)
item-76 at level 4: list_item: Lloyd CT; Sorichetta A; Tatem AJ ... in population studies. Sci Data (2017)
item-77 at level 4: list_item: Elvidge CD; Baugh KE; Zhizhin M; ... hts. Proc Asia-Pacific Adv Netw (2013)
item-78 at level 4: list_item: Jambulingam P; Subramanian S; De ... dicators. Parasites and Vectors (2016)
item-79 at level 4: list_item: Michael E; Malecela-Lazaro MN; S ... c filariasis. Lancet Infect Dis (2004)
item-80 at level 4: list_item: Stolk WA; Swaminathan S; van Oor ... simulation study. J Infect Dis (2003)
item-81 at level 4: list_item: Grady CA; De Rochars MB; Direny ... asis programs. Emerg Infect Dis (2007)
item-82 at level 4: list_item: Evans D; McFarland D; Adamani W; ... Nigeria. Ann Trop Med Parasitol (2011)
item-83 at level 4: list_item: Richards FO; Eigege A; Miri ES; ... in Nigeria. PLoS Negl Trop Dis (2011)
item-84 at level 4: list_item: Biritwum NK; Yikpotey P; Marfo B ... Ghana. Trans R Soc Trop Med Hyg (2016)
item-85 at level 4: list_item: Moraga P; Cano J; Baggaley RF; G ... odelling. Parasites and Vectors (2015)
item-86 at level 4: list_item: Irvine MA; Njenga SM; Gunawarden ... ction. Trans R Soc Trop Med Hyg (2016)
item-87 at level 4: list_item: Ottesen EA. Efficacy of diethylc ... ariae in humans. Rev Infect Dis (1985)
item-88 at level 4: list_item: Gambhir M; Bockarie M; Tisch D; ... lymphatic filariasis. BMC Biol (2010)
item-89 at level 4: list_item: World Health Organization. Globa ... al entomology handbook. Geneva; 2013.
item-90 at level 4: list_item: Slater H; Michael E. Predicting ... gical niche modelling. PLoS One (2012)
item-91 at level 4: list_item: Slater H; Michael E. Mapping, Ba ... prevalence in Africa. PLoS One (2013)
item-92 at level 4: list_item: Sabesan S; Raju KHK; Subramanian ... odel. Vector-Borne Zoonotic Dis (2013)
item-93 at level 4: list_item: Stanton MC; Molyneux DH; Kyelem ... in Burkina Faso. Geospat Health (2013)
item-94 at level 4: list_item: Manhenje I; Teresa Galán-Puchade ... hern Mozambique. Geospat Health (2013)
item-95 at level 4: list_item: Ngwira BM; Tambala P; Perez a M; ... infection in Malawi. Filaria J (2007)
item-96 at level 4: list_item: Simonsen PE; Mwakitalu ME. Urban ... hatic filariasis. Parasitol Res (2013)
item-97 at level 4: list_item: Proville J; Zavala-Araiza D; Wag ... socio-economic trends. PLoS One (2017)
item-98 at level 4: list_item: Endeshaw T; Taye A; Tadesse Z; K ... st Ethiopia. Pathog Glob Health (2015)
item-99 at level 4: list_item: Richards FO; Eigege A; Pam D; Ka ... eas of co-endemicity. Filaria J (2005)
item-100 at level 4: list_item: Kyelem D; Sanou S; Boatin B a; M ... cations. Ann Trop Med Parasitol (2003)
item-101 at level 4: list_item: Weil GJ; Lammie PJ; Richards FO; ... ne and ivermectin. J Infect Dis (1991)
item-102 at level 4: list_item: Kumar A; Sachan P. Measuring imp ... rug administration. Trop Biomed (2014)
item-103 at level 4: list_item: Njenga SM; Mwandawiro CS; Wamae ... control. Parasites and Vectors (2011)
item-104 at level 4: list_item: Boyd A; Won KY; McClintock SK; D ... gane, Haiti. PLoS Negl Trop Dis (2010)
item-105 at level 4: list_item: Irvine MA; Reimer LJ; Njenga SM; ... mination. Parasites and Vectors (2015)
item-106 at level 4: list_item: Irvine MA; Stolk WA; Smith ME; S ... elling study. Lancet Infect Dis (2017)
item-107 at level 4: list_item: Pion SD; Montavon C; Chesnais CB ... crofilaremia. Am J Trop Med Hyg (2016)
item-108 at level 4: list_item: Wanji S; Esum ME; Njouendou AJ; ... in Cameroon. PLoS Negl Trop Dis (2018)
item-109 at level 4: list_item: Chesnais CB; Awaca-Uvon NP; Bola ... a in Africa. PLoS Negl Trop Dis (2017)
item-110 at level 4: list_item: Silumbwe A; Zulu JM; Halwindi H; ... haran Africa. BMC Public Health (2017)
item-111 at level 4: list_item: Adams AM; Vuckovic M; Birch E; B ... nistration. Trop Med Infect Dis (2018)
item-112 at level 4: list_item: Rao RU; Samarasekera SD; Nagodav ... n Sri Lanka. PLoS Negl Trop Dis (2017)
item-113 at level 4: list_item: Xu Z; Graves PM; Lau CL; Clement ... is in American Samoa. Epidemics (2018)
item-114 at level 4: list_item: Id CM; Tettevi EJ; Mechan F; Idu ... rural Ghana. PLoS Negl Trop Dis (2019)
item-115 at level 4: list_item: Eigege A; Kal A; Miri E; Sallau ... in Nigeria. PLoS Negl Trop Dis (2013)
item-116 at level 4: list_item: Van den Berg H; Kelly-Hope LA; L ... r management. Lancet Infect Dis (2013)
item-117 at level 4: list_item: Webber R. Eradication of Wucher ... ntrol. Trans R Soc Trop Med Hyg (1979)
item-118 at level 1: caption: Table 1: Categorization of potential factors influencing pre-TAS results.
item-119 at level 1: caption: Table 2: Adjusted risk ratios fo ... g-binomial model sensitivity analysis.
item-120 at level 1: caption: Fig 1: Number of pre-TAS by country.
item-121 at level 1: caption: Fig 2: District-level baseline prevalence by country.
item-122 at level 1: caption: Fig 3: Percent pre-TAS failure by each characteristic (unadjusted).
item-123 at level 1: caption: Fig 4: Adjusted risk ratios for ... ence Interval from log-binomial model.
item-124 at level 1: caption: Fig 5: Analysis of failures by model combinations.

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,222 @@
# Risk factors associated with failing pre-transmission assessment surveys (pre-TAS) in lymphatic filariasis elimination programs: Results of a multi-country analysis
Burgert-Brucker Clara R.; 1: Global Health Division, RTI International, Washington, DC, United States of America; Zoerhoff Kathryn L.; 1: Global Health Division, RTI International, Washington, DC, United States of America; Headland Maureen; 1: Global Health Division, RTI International, Washington, DC, United States of America, 2: Global Health, Population, and Nutrition, FHI 360, Washington, DC, United States of America; Shoemaker Erica A.; 1: Global Health Division, RTI International, Washington, DC, United States of America; Stelmach Rachel; 1: Global Health Division, RTI International, Washington, DC, United States of America; Karim Mohammad Jahirul; 3: Department of Disease Control, Ministry of Health and Family Welfare, Dhaka, Bangladesh; Batcho Wilfrid; 4: National Control Program of Communicable Diseases, Ministry of Health, Cotonou, Benin; Bougouma Clarisse; 5: Lymphatic Filariasis Elimination Program, Ministère de la Santé, Ouagadougou, Burkina Faso; Bougma Roland; 5: Lymphatic Filariasis Elimination Program, Ministère de la Santé, Ouagadougou, Burkina Faso; Benjamin Didier Biholong; 6: National Onchocerciasis and Lymphatic Filariasis Control Program, Ministry of Health, Yaounde, Cameroon; Georges Nko'Ayissi; 6: National Onchocerciasis and Lymphatic Filariasis Control Program, Ministry of Health, Yaounde, Cameroon; Marfo Benjamin; 7: Neglected Tropical Diseases Programme, Ghana Health Service, Accra, Ghana; Lemoine Jean Frantz; 8: Ministry of Health, Port-au-Prince, Haiti; Pangaribuan Helena Ullyartha; 9: National Institute Health Research & Development, Ministry of Health, Jakarta, Indonesia; Wijayanti Eksi; 9: National Institute Health Research & Development, Ministry of Health, Jakarta, Indonesia; Coulibaly Yaya Ibrahim; 10: Filariasis Unit, International Center of Excellence in Research, Faculty of Medicine and Odontostomatology, Bamako, Mali; Doumbia Salif Seriba; 10: Filariasis Unit, International Center of Excellence in Research, Faculty of Medicine 
and Odontostomatology, Bamako, Mali; Rimal Pradip; 11: Epidemiology and Disease Control Division, Department of Health Service, Kathmandu, Nepal; Salissou Adamou Bacthiri; 12: Programme Onchocercose et Filariose Lymphatique, Ministère de la Santé, Niamey, Niger; Bah Yukaba; 13: National Neglected Tropical Disease Program, Ministry of Health and Sanitation, Freetown, Sierra Leone; Mwingira Upendo; 14: Neglected Tropical Disease Control Programme, National Institute for Medical Research, Dar es Salaam, Tanzania; Nshala Andreas; 15: IMA World Health/Tanzania NTD Control Programme, Uppsala University, & TIBA Fellow, Dar es Salaam, Tanzania; Muheki Edridah; 16: Programme to Eliminate Lymphatic Filariasis, Ministry of Health, Kampala, Uganda; Shott Joseph; 17: Division of Neglected Tropical Diseases, Office of Infectious Diseases, Bureau for Global Health, USAID, Washington, DC, United States of America; Yevstigneyeva Violetta; 17: Division of Neglected Tropical Diseases, Office of Infectious Diseases, Bureau for Global Health, USAID, Washington, DC, United States of America; Ndayishimye Egide; 2: Global Health, Population, and Nutrition, FHI 360, Washington, DC, United States of America; Baker Margaret; 1: Global Health Division, RTI International, Washington, DC, United States of America; Kraemer John; 1: Global Health Division, RTI International, Washington, DC, United States of America, 18: Georgetown University, Washington, DC, United States of America; Brady Molly; 1: Global Health Division, RTI International, Washington, DC, United States of America
## Abstract
Achieving elimination of lymphatic filariasis (LF) as a public health problem requires a minimum of five effective rounds of mass drug administration (MDA) and demonstrating low prevalence in subsequent assessments. The first assessments recommended by the World Health Organization (WHO) are sentinel and spot-check sites—referred to as pre-transmission assessment surveys (pre-TAS)—in each implementation unit after MDA. If pre-TAS shows that prevalence in each site has been lowered to less than 1% microfilaremia or less than 2% antigenemia, the implementation unit conducts a TAS to determine whether MDA can be stopped. Failure to pass pre-TAS means that further rounds of MDA are required. This study aims to understand factors influencing pre-TAS results using existing programmatic data from 554 implementation units, of which 74 (13%) failed, in 13 countries. Secondary data analysis was completed using existing data from Bangladesh, Benin, Burkina Faso, Cameroon, Ghana, Haiti, Indonesia, Mali, Nepal, Niger, Sierra Leone, Tanzania, and Uganda. Additional covariate data were obtained from spatial raster data sets. Bivariate analysis and multilinear regression were performed to establish potential relationships between variables and the pre-TAS result. Higher baseline prevalence and lower elevation were significant in the regression model. Variables statistically significantly associated with failure (p-value ≤0.05) in the bivariate analyses included baseline prevalence at or above 5% or 10%, use of Filariasis Test Strips (FTS), primary vector of Culex, treatment with diethylcarbamazine-albendazole, higher elevation, higher population density, higher enhanced vegetation index (EVI), higher annual rainfall, and 6 or more rounds of MDA. This paper reports for the first time factors associated with pre-TAS results from a multi-country analysis. 
This information can help countries more effectively forecast program activities, such as the potential need for more rounds of MDA, and prioritize resources to ensure adequate coverage of all persons in areas at highest risk of failing pre-TAS. Author summary: Achieving elimination of lymphatic filariasis (LF) as a public health problem requires a minimum of five rounds of mass drug administration (MDA) and being able to demonstrate low prevalence in several subsequent assessments. LF elimination programs implement sentinel and spot-check site assessments, called pre-TAS, to determine whether districts are eligible to implement more rigorous population-based surveys to determine whether MDA can be stopped or if further rounds are required. Reasons for failing pre-TAS are not well understood and have not previously been examined with data compiled from multiple countries. For this analysis, we analyzed data from routine USAID and WHO reports from Bangladesh, Benin, Burkina Faso, Cameroon, Ghana, Haiti, Indonesia, Mali, Nepal, Niger, Sierra Leone, Tanzania, and Uganda. In a model that included multiple variables, high baseline prevalence and lower elevation were significant. In models comparing only one variable to the outcome, the following were statistically significantly associated with failure: higher baseline prevalence at or above 5% or 10%, use of the FTS, primary vector of Culex, treatment with diethylcarbamazine-albendazole, lower elevation, higher population density, higher Enhanced Vegetation Index, higher annual rainfall, and six or more rounds of mass drug administration. These results can help national programs plan MDA more effectively, e.g., by focusing resources on areas with higher baseline prevalence and/or lower elevation.
## Introduction
Lymphatic filariasis (LF), a disease caused by parasitic worms transmitted to humans by mosquito bite, manifests in disabling and stigmatizing chronic conditions including lymphedema and hydrocele. To eliminate LF as a public health problem, the World Health Organization (WHO) recommends two strategies: reducing transmission through annual mass drug administration (MDA) and reducing suffering through ensuring the availability of morbidity management and disability prevention services to all patients [1]. For the first strategy, eliminating LF as a public health problem is defined as a reduction in measurable prevalence in infection in endemic areas below a target threshold at which further transmission is considered unlikely even in the absence of MDA [2]. As of 2018, 14 countries have eliminated LF as a public health problem while 58 countries remain endemic for LF [3].
The road to elimination as a public health problem has several milestones. First, where LF prevalence at baseline has exceeded 1% as measured either through microfilaremia (Mf) or antigenemia (Ag), MDA is implemented and treatment coverage is measured in all implementation units, which usually correspond to districts. Implementation units must complete at least five rounds of effective treatment, i.e. treatment with a minimum coverage of 65% of the total population. Then, WHO recommends sentinel and spot-check site assessments—referred to as pre-transmission assessment surveys (pre-TAS)—in each implementation unit to determine whether prevalence in each site is less than 1% Mf or less than 2% Ag [4]. Next, if these thresholds are met, national programs can progress to the first transmission assessment survey (TAS). The TAS is a population-based cluster or systematic survey of six- and seven-year-old children to assess whether transmission has fallen below the threshold at which infection is believed to persist. TAS is conducted at least three times, with two years between each survey. TAS 1 results determine if it is appropriate to stop MDA or whether further rounds are required. Finally, when TAS 2 and 3 also fall below the set threshold in every endemic implementation unit nationwide and morbidity criteria have been fulfilled, the national program submits a dossier to WHO requesting that elimination be officially validated.
Pre-TAS include at least one sentinel and one spot-check site per one million population. Sentinel sites are established at the start of the program in villages where LF prevalence was believed to be relatively high. Spot-check sites are villages not previously tested but purposively selected as potentially high-risk areas due to original high prevalence, low coverage during MDA, high vector density, or other factors [4]. At least six months after MDA implementation, data are collected from a convenience sample of at least 300 people over five years old in each site. Originally, Mf was recommended as the indicator of choice for pre-TAS, assessed by blood smears taken at the time of peak parasite periodicity [4]. WHO later recommended the use of circulating filarial antigen rapid diagnostic tests, BinaxNow immunochromatographic card tests (ICTs), and after 2016, Alere Filariasis Test Strips (FTS), because they are more sensitive, easier to implement, and more flexible about time of day that blood can be taken [5].
When a country fails to meet the established thresholds in a pre-TAS, they must implement at least two more rounds of MDA. National programs need to forecast areas that might fail pre-TAS and need repeated MDA, so that they can inform the community and district decision-makers of the implications of pre-TAS failure, including the need for continued MDA to lower prevalence effectively. In addition, financial and human resources must be made available for ordering drugs, distributing drugs, supervision and monitoring to implement the further MDA rounds. Ordering drugs and providing MDA budgets often need to be completed before the pre-TAS are implemented, so contingency planning and funding are important to ensure rounds of MDA are not missed.
This study aims to understand which factors are associated with the need for additional rounds of MDA as identified by pre-TAS results using programmatic data from 13 countries. The factors associated with failing pre-TAS are not well understood and have not previously been examined at a multi-country scale in the literature. We examine the association between pre-TAS failure and baseline prevalence, parasites, environmental factors, MDA implementation, and pre-TAS implementation. Understanding determinants of pre-TAS failure will help countries identify where elimination may be most difficult and prioritize the use of limited LF elimination resources.
## Methods
This is a secondary data analysis using existing data, collected for programmatic purposes. Data for this analysis come from 568 districts in 13 countries whose LF elimination programs were supported by the United States Agency for International Development (USAID) through the ENVISION project, led by RTI International, and the END in Africa and END in Asia projects, led by FHI 360. These countries are Bangladesh, Benin, Burkina Faso, Cameroon, Ghana, Haiti, Indonesia, Mali, Nepal, Niger, Sierra Leone, Tanzania, and Uganda. The data represent all pre-TAS funded by USAID from 2012 to 2017 and, in some cases, surveys funded by host government or other non-United States government funders. Because pre-TAS data were collected as part of routine program activities in most countries, in general, ethical clearance was not sought for these surveys. Our secondary analysis only included the aggregated survey results and therefore did not constitute human subjects research; no ethical approval was required.
Building on previous work, we delineated five domains of variables that could influence pre-TAS outcomes: prevalence, agent, environment, MDA, and pre-TAS implementation (Table 1) [6–8]. We prioritized key concepts that could be measured through our data or captured through publicly available global geospatial data sets.
### Data sources
Information on baseline prevalence, MDA coverage, the number of MDA rounds, and pre-TAS information (month and year of survey, district, site name, and outcome) was gathered through regular reporting for the USAID-funded NTD programs (ENVISION, END in Africa, and END in Asia). These data were augmented by other reporting data such as the country's dossier data annexes, the WHO Preventive Chemotherapy and Transmission Control Databank, and WHO reporting forms. Data were then reviewed by country experts, including the Ministry of Health program staff and implementing program staff, and updated as necessary. Data on vectors were also obtained from country experts. The district geographic boundaries were matched to geospatial shapefiles from the ENVISION project geospatial data repository, while other geospatial data were obtained through publicly available sources (Table 1).
### Outcome and covariate variables
The outcome of interest for this analysis was whether a district passed or failed the pre-TAS. Failure was defined as any district that had at least one sentinel or spot-check site with a prevalence higher than or equal to 1% Mf or 2% Ag [4].
Potential covariates were derived from the available data for each factor in the domain groups listed in Table 1. New dichotomous variables were created for all variables that had multiple categories or were continuous for ease of interpretation in models and use in program decision-making. Cut-points for continuous variables were derived from either a priori knowledge or through exploratory analysis considering the mean or median value of the dataset, looking to create two groups of similar size with logical cut-points (e.g. rounding numbers to whole numbers). All the variables derived from publicly available global spatial raster datasets were summarized to the district level in ArcGIS Pro using the “zonal statistics” tool. The final output used the continuous value measuring the mean pixel value for the district for all variables except geographic area. Categories for each variable were determined by selecting the mean or median dataset value or cut-off used in other relevant literature [7]. The following section describes the variables that were included in the final analysis and the final categorizations used.
#### Baseline prevalence
Baseline prevalence can be assumed as a proxy for local transmission conditions [14] and correlates with prevalence after MDA [14–20]. Baseline prevalence for each district was measured by either blood smears to measure Mf or rapid diagnostic tests to measure Ag. Other studies have modeled Mf and Ag prevalence separately, due to lack of a standardized correlation between the two, especially at pre-MDA levels [21,22]. However, because WHO mapping guidance states that MDA is required if either Mf or Ag is ≥1% and there were not enough data to model each separately, we combined baseline prevalence values regardless of diagnostic test used. We created two variables for use in the analysis (1) using the cut-off of <5% or ≥5% (dataset median value of 5%) and (2) using the cut-off of <10% or ≥10%.
#### Agent
In terms of differences in transmission dynamics by agent, research has shown that Brugia spp. are more susceptible to the anti-filarial drug regimens than Wuchereria bancrofti parasites [23]. Thus, we combined districts reporting B. malayi and B. timori and compared them to areas with W. bancrofti or mixed parasites. Two variables from other domains were identified in exploratory analyses to be highly colinear with the parasite, and thus we considered them in the same group of variables for the final regression models. These were variables delineating vectors (Anopheles or Mansonia compared to Culex) from the environmental domain and drug package [ivermectin-albendazole (IVM-ALB) compared to diethylcarbamazine-albendazole (DEC-ALB)] from the MDA domain.
#### Environment
LF transmission intensity is influenced by differing vector transmission dynamics, including vector biting rates and competence, and the number of individuals with microfilaria [21,24,25]. Since vector data are not always available, previous studies have explored whether environmental variables associated with vector density, such as elevation, rainfall, and temperature, can be used to predict LF prevalence [8,21,26–31]. We included the district area and elevation in meters as geographic variables potentially associated with transmission intensity. In addition, within the climate factor, we included Enhanced Vegetation Index (EVI) and rainfall variables. EVI measures vegetation levels, or “greenness,” where a higher index value indicates a higher level of “greenness.”
We included the socio-economic variable of population density, as it has been positively associated with LF prevalence in some studies [8,27,29], but no significant association has been found in others [30]. Population density could be correlated with vector, as in eastern African countries LF is mostly transmitted by Culex in urban areas and by Anopheles in rural areas [32]. Additionally, inclusion of the satellite imagery of nighttime lights data is another proxy for socio-economic status [33].
Finally, all or parts of districts that are co-endemic with onchocerciasis may have received multiple rounds of MDA with ivermectin before LF MDA started, which may have lowered LF prevalence in an area [34–36]. Thus, we included a categorical variable to distinguish if districts were co-endemic with onchocerciasis.
#### MDA
Treatment effectiveness depends upon both drug efficacy (ability to kill adult worms, ability to kill Mf, drug resistance, drug quality) and implementation of MDA (coverage, compliance, number of rounds) [14,16]. Ivermectin is less effective against adult worms than DEC, and therefore it is likely that Ag reduction is slower in areas using ivermectin instead of DEC in MDA [37]. Models also have shown that MDA coverage affects prevalence, although coverage has been defined in various ways, such as median coverage, number of rounds, or individual compliance [14–16,20,38–40]. Furthermore, systematic non-compliance, or population sub-groups which consistently refuse to take medicines, has been shown to represent a threat to elimination [41,42].
We considered three approaches when analyzing the MDA data: median MDA coverage in the most recent 5 rounds, number of rounds with sufficient coverage in the most recent 5 rounds, and count of the total number of rounds. MDA coverage is considered sufficient at or above 65% of the total population who were reported to have ingested the drugs; this was used as the cut point for MDA median coverage for the most recent 5 rounds. The rounds of sufficient coverage variable was categorized as having 2 or fewer rounds compared to 3 or more sufficient rounds. The total number of MDA rounds variable was categorized at 5 or fewer rounds compared to 6 or more rounds ever documented in that district.
#### Pre-TAS implementation
Pre-TAS results can be influenced by the implementation of the survey itself, including the use of a particular diagnostic test, the selection of sites, the timing of survey, and the appropriate application of methods for population recruitment and diagnostic test administration. We included two variables in the pre-TAS implementation domain: “type of diagnostic method used” and “diagnostic test used.” The “type of diagnostic method used” variable categorized districts by either using Mf or Ag. The “diagnostic test used” variable examined Mf (reference category) compared to ICT and compared to FTS (categorical variable with 3 values). This approach was used to compare each test to each other. Countries switched from ICT to FTS during 2016, while Mf testing continued to be used throughout the time period of study.
### Data inclusion criteria
The dataset, summarized at the district level, included information from 568 districts where a pre-TAS was being implemented for the first time. A total of 14 districts were removed from the final analysis due to missing data related to the following points: geospatial boundaries (4), baseline prevalence (4), and MDA coverage (6). The final analysis dataset had 554 districts.
### Statistical analysis and modeling
Statistical analysis and modeling were done with Stata MP 15.1 (College Station, TX). Descriptive statistics comparing various variables to the principal outcome were performed. Significant differences were identified using a chi-square test. A generalized linear model (GLM) with a log link and binomial error distribution—which estimates relative risks—was developed using forward stepwise modeling methods (called log-binomial model). Models with higher pseudo-r-squared and lower Akaike information criterion (AIC) were retained at each step. Pseudo-r-squared is a value between 0 and 1 with the higher the value, the better the model is at predicting the outcome of interest. AIC values are used to compare the relative quality of models compared to each other; in general, a lower value indicates a better model. Variables were tested by factor group. Once a variable was selected from the group, no other variable in that same group was eligible to be included in the final model due to issues of collinearity and small sample sizes. Interaction between terms in the model was tested after model selection, and interaction terms that modified the original terms' significance were included in the final model. Overall, the number of potential variables able to be included in the model remained low due to the relatively small number of failure results (13%) in the dataset. Furthermore, the models with more than 3 variables and one interaction term either were unstable (indicated by very large confidence interval widths) or did not improve the model by being significant predictors or by modifying other parameters already in the model. These models were at heightened risk of non-convergence; we limited the number of variables accordingly.
Sensitivity analysis was performed for the final log-binomial model to test for the validity of results under different parameters by excluding some sub-sets of districts from the dataset and rerunning the model. This analysis was done to understand the robustness of the model when (1) excluding all districts in Cameroon, (2) including only districts in Africa, (3) including only districts with W. bancrofti parasite, and (4) including only districts with Anopheles as the primary vector. The sensitivity analysis excluding Cameroon was done for two reasons. First, Cameroon had the most pre-TAS results included, but no failures. Second, 70% of the Cameroon districts included in the analysis are co-endemic for loiasis. Given that diagnostic tests used in LF mapping have since been shown to cross-react with loiasis, there is some concern that these districts might not have been truly LF-endemic [43,44].
## Results
The overall pre-TAS pass rate for the districts included in this analysis was 87% (74 failures in 554 districts). Nearly 40% of the 554 districts were from Cameroon (134) and Tanzania (87) (Fig 1). No districts in Bangladesh, Cameroon, Mali, or Uganda failed a pre-TAS in this data set; over 25% of districts in Burkina Faso, Ghana, Haiti, Nepal, and Sierra Leone failed pre-TAS in this data set. Baseline prevalence varied widely within and between the 13 countries. Fig 2 shows the highest, lowest, and median baseline prevalence in the study districts by country. Burkina Faso had the highest median baseline prevalence at 52% and Burkina Faso, Tanzania, and Ghana all had at least one district with a very high baseline of over 70%. In Mali, Indonesia, Benin, and Bangladesh, all districts had baseline prevalences below 20%.
Fig 3 shows the unadjusted analysis for key variables by pre-TAS result. Variables statistically significantly associated with failure (p-value ≤0.05) included higher baseline prevalence at or above 5% or 10%, FTS diagnostic test, primary vector of Culex, treatment with DEC-ALB, higher elevation, higher population density, higher EVI, higher annual rainfall, and six or more rounds of MDA. Variables that were not significantly associated with pre-TAS failure included diagnostic method used (Ag or Mf), parasite, co-endemicity for onchocerciasis, median MDA coverage, and sufficient rounds of MDA.
The final log-binomial model included the variables of baseline prevalence ≥10%, the diagnostic test used (FTS and ICT), and elevation. The final model also included a significant interaction term between high baseline and diagnostic test used.
Fig 4 shows the risk ratio results with their corresponding confidence intervals. In a model with interaction between baseline and diagnostic test the baseline parameter was significant while diagnostic test and the interaction term were not. Districts with high baseline had a statistically significant (p-value ≤0.05) 2.52 times higher risk of failure (95% CI 1.37–4.64) compared to those with low baseline prevalence. The FTS diagnostic test or ICT diagnostic test alone were not significant nor was the interaction term. Additionally, districts with an elevation below 350 meters had a statistically significant (p-value ≤0.05) 3.07 times higher risk of failing pre-TAS (95% CI 1.95–4.83).
Sensitivity analyses were conducted using the same model with different subsets of the dataset including (1) all districts except for districts in Cameroon (134 total with no failures), (2) only districts in Africa, (3) only districts with W. bancrofti, and (4) only districts with Anopheles as primary vector. The results of the sensitivity models (Table 2) indicate an overall robust model. High baseline and lower elevation remained significant across all the models. The ICT diagnostic test used remains insignificant across all models. The FTS diagnostic test was positively significant in model 1 and negatively significant in model 4. The interaction term of baseline prevalence and FTS diagnostic test was significant in three models though the estimate was unstable in the W. bancrofti-only and Anopheles-only models (models 3 and 4 respectively), as signified by large confidence intervals.
Overall 74 districts in the dataset failed pre-TAS. Fig 5 summarizes the likelihood of failure by variable combinations identified in the log-binomial model. For those districts with a baseline prevalence ≥10% that used a FTS diagnostic test and have an average elevation below 350 meters (Combination C01), 87% of the 23 districts failed. Of districts with high baseline that used an ICT diagnostic test and have a low average elevation (C02) 45% failed. Overall, combinations with high baseline and low elevation C01, C02, and C04 accounted for 51% of all the failures (38 of 74).
## Discussion
This paper reports for the first time factors associated with pre-TAS results from a multi-country analysis. Variables significantly associated with failure were higher baseline prevalence and lower elevation. Districts with a baseline prevalence of 10% or more were at 2.52 times higher risk to fail pre-TAS in the final log-binomial model. In the bivariate analysis, baseline prevalence above 5% was also significantly more likely to fail compared to lower baselines, which indicates that the threshold for higher baseline prevalence may be as little as 5%, similar to what was found in Goldberg et al., which explored ecological and socioeconomic factors associated with TAS failure [7].
Though diagnostic test used was selected for the final log-binomial model, neither category (FTS or ICT) was significant after interaction with high baseline. FTS alone is significant in the bivariate analysis compared to ICT or Mf. This result is not surprising given previous research which found that FTS was more sensitive than ICT [45].
Elevation was the only environmental domain variable selected for the final log-binomial model during the model selection process, with areas of lower elevation (<350m) found to be at 3.07 times higher risk to fail pre-TAS compared to districts with a higher elevation. Similar results related to elevation were found in previous studies [8,31], including Goldberg et al. [7], who used a cutoff of 200 meters. Elevation likely also encompasses some related environmental concepts, such as vector habitat, greenness (EVI), or rainfall, which impact vector chances of survival.
The small number of failures overall prevented the inclusion of a large number of variables in the final log-binomial model. However, other variables that are associated with failure as identified in the bivariate analyses, such as Culex vector, higher population density, higher EVI, higher rainfall and more rounds of MDA, should not be discounted when making programmatic decisions. Other models have shown that Culex as the predominant vector in a district, compared to Anopheles, results in more intense interventions needed to reach elimination [24,41]. Higher population density, which was also found to predict TAS failure [7], could be related to different vector species transmission dynamics in urban areas, as well as the fact that MDAs are harder to conduct and to accurately measure in urban areas [46,47]. Both higher enhanced vegetation index (>0.3) and higher rainfall (>700 mm per year) contribute to expansion of vector habitats and population. Additionally, having more than five rounds of MDA before pre-TAS was also statistically significantly associated with higher failure in the bivariate analysis. It is unclear why higher number of rounds is associated with first pre-TAS failure given that other research has shown the opposite [15,16].
All other variables included in this analysis were not significantly associated with pre-TAS failure in our analysis. Goldberg et al. found Brugia spp. to be significantly associated with failure, but our results did not. This is likely due in part to the small number of districts with Brugia spp. in our dataset (6%) compared to 46% in the Goldberg et al. article [7]. MDA coverage levels were not significantly associated with pre-TAS failure, likely due to the lack of variance in the coverage data since WHO guidance dictates a minimum of five rounds of MDA with ≥65% epidemiological coverage to be eligible to implement pre-TAS. It should not be interpreted as evidence that high MDA coverage levels are not necessary to lower prevalence.
Limitations to this study include data sources, excluded data, unreported data, misassigned data, and aggregation of results at the district level. The main data sources for this analysis were programmatic data, which may be less accurate than data collected specifically for research purposes. This is particularly true of the MDA coverage data, where some countries report data quality challenges in areas of instability or frequent population migration. Even though risk factors such as age, sex, compliance with MDA, and use of bednets have been shown to influence infection in individuals [40,48–50], we could not include factors from the human host domain in our analysis, as data sets were aggregated at site level and did not include individual information. In addition, vector control data were not universally available across the 13 countries and thus were not included in the analysis, despite studies showing that vector control has an impact on reducing LF prevalence [41,48,51–53].
Fourteen districts were excluded from the analysis because we were not able to obtain complete data for baseline prevalence, MDA coverage, or geographic boundaries. One of these districts had failed pre-TAS. It is likely these exclusions had minimal impact on the conclusions, as they represented a small number of districts and were similar to other included districts in terms of key variables. Unreported data could have occurred if a country conducted a pre-TAS that failed and then chose not to report it or reported it as a mid-term survey instead. Anecdotally, we know this has occurred occasionally, but we do not believe the practice to be widespread. Another limitation in the analysis is a potential misassignment of key variable values to a district due to changes in the district over time. Redistricting, changes in district size or composition, was pervasive in many countries during the study period; however, we expect the impact on the study outcome to be minimal, as the historical prevalence and MDA data from the “mother” districts are usually flowed down to these new “daughter” districts. However, it is possible that the split created an area of higher prevalence or lower MDA coverage than would have been found on average in the overall larger original “mother” district. Finally, the aggregation or averaging of results to the district level may mask heterogeneity within districts. Though this impact could be substantial in districts with considerable heterogeneity, the use of median values and binomial variables mitigated the likelihood of skewing the data to extreme outliers in a district.
As this analysis used data across a variety of countries and epidemiological situations, the results are likely relevant for other districts in the countries examined and in countries with similar epidemiological backgrounds. In general, as more data become available at site level through the increased use of electronic data collection tools, further analysis of geospatial variables and associations will be possible. For example, with the availability of GPS coordinates, it may become possible to analyze outcomes by site and to link the geospatial environmental domain variables at a smaller scale. Future analyses also might seek to include information from coverage surveys or qualitative research studies on vector control interventions such as bed net usage, MDA compliance, population movement, and sub-populations that might be missed during MDA. Future pre-TAS using electronic data collection could include sex and age of individuals included in the survey.
This paper provides evidence from analysis of 554 districts and 13 countries on the factors associated with pre-TAS results. Baseline prevalence, elevation, vector, population density, EVI, rainfall, and number of MDA rounds were all significant in either bivariate or multivariate analyses. This information along with knowledge of local context can help countries more effectively plan pre-TAS and forecast program activities, such as the potential need for more than five rounds of MDA in areas with high baseline and/or low elevation.
## Tables
Table 1: Categorization of potential factors influencing pre-TAS results.
| Domain | Factor | Covariate | Description | Reference Group | Summary statistic | Temporal Resolution | Source |
|------------------------|-----------------------|-------------------------------|-----------------------------------------------------------------|----------------------|---------------------|-----------------------|--------------------|
| Prevalence | Baseline prevalence | 5% cut off | Maximum reported mapping or baseline sentinel site prevalence | <5% | Maximum | Varies | Programmatic data |
| Prevalence | Baseline prevalence | 10% cut off | Maximum reported mapping or baseline sentinel site prevalence | <10% | Maximum | Varies | Programmatic data |
| Agent | Parasite | Parasite | Predominate parasite in district | W. bancrofti & mixed | Binary value | 2018 | Programmatic data |
| Environment | Vector | Vector | Predominate vector in district | Anopheles & Mansonia | Binary value | 2018 | Country expert |
| Environment | Geography | Elevation | Elevation measured in meters | >350 | Mean | 2000 | CGIAR-CSI SRTM [9] |
| Environment | Geography | District area | Area measured in km2 | >2,500 | Maximum sum | Static | Programmatic data |
| Environment | Climate | EVI | Enhanced vegetation index | > 0.3 | Mean | 2015 | MODIS [10] |
| Environment | Climate | Rainfall | Annual rainfall measured in mm | ≤ 700 | Mean | 2015 | CHIRPS [11] |
| Environment | Socio-economic | Population density | Number of people per km2 | ≤ 100 | Mean | 2015 | WorldPop [12] |
| Environment | Socio-economic | Nighttime lights | Nighttime light index from 0 to 63 | >1.5 | Mean | 2015 | VIIRS [13] |
| Environment | Co-endemicity | Co-endemic for onchocerciasis | Part or all of district is also endemic for onchocerciasis | Non-endemic | Binary value | 2018 | Programmatic data |
| MDA | Drug efficacy | Drug package | DEC-ALB or IVM-ALB | DEC-ALB | Binary value | 2018 | Programmatic data |
| MDA | Implementation of MDA | Coverage | Median MDA coverage for last 5 rounds | ≥ 65% | Median | Varies | Programmatic data |
| MDA | Implementation of MDA | Sufficient rounds | Number of rounds of sufficient (≥ 65% coverage) in last 5 years | ≥ 3 | Count | Varies | Programmatic data |
| MDA | Implementation of MDA | Number of rounds | Maximum number of recorded rounds of MDA | ≥ 6 | Maximum | Varies | Programmatic data |
| Pre-TAS implementation | Quality of survey | Diagnostic method | Using Mf or Ag | Mf | Binary value | Varies | Programmatic data |
| Pre-TAS implementation | Quality of survey | Diagnostic test | Using Mf, ICT, or FTS | Mf | Categorical | Varies | Programmatic data |
Table 2: Adjusted risk ratios for pre-TAS failure from log-binomial model sensitivity analysis.
| | | (1) | (2) | (3) | (4) |
|---------------------------------------------|------------------|----------------------------|--------------------------|--------------------------------------|---------------------------------|
| | Full Model | Without Cameroon districts | Only districts in Africa | Only W. bancrofti parasite districts | Only Anopheles vector districts |
| Number of Failures | 74 | 74 | 44 | 72 | 46 |
| Number of total districts | (N = 554) | (N = 420) | (N = 407) | (N = 518) | (N = 414) |
| Covariate | RR (95% CI) | RR (95% CI) | RR (95% CI) | RR (95% CI) | RR (95% CI) |
| Baseline prevalence > = 10% & used FTS test | 2.38 (0.96–5.90) | 1.23 (0.52–2.92) | 14.52 (1.79–117.82) | 2.61 (1.03–6.61) | 15.80 (1.95–127.67) |
| Baseline prevalence > = 10% & used ICT test | 0.80 (0.20–3.24) | 0.42 (0.11–1.68) | 1.00 (0.00–0.00) | 0.88 (0.21–3.60) | 1.00 (0.00–0.00) |
| +Used FTS test | 1.16 (0.52–2.59) | 2.40 (1.12–5.11) | 0.15 (0.02–1.11) | 1.03 (0.45–2.36) | 0.13 (0.02–0.96) |
| +Used ICT test | 0.92 (0.32–2.67) | 1.47 (0.51–4.21) | 0.33 (0.04–2.54) | 0.82 (0.28–2.43) | 0.27 (0.03–2.04) |
| +Baseline prevalence > = 10% | 2.52 (1.37–4.64) | 2.42 (1.31–4.47) | 2.03 (1.06–3.90) | 2.30 (1.21–4.36) | 2.01 (1.07–3.77) |
| Elevation < 350m | 3.07 (1.95–4.83) | 2.21 (1.42–3.43) | 4.68 (2.22–9.87) | 3.04 (1.93–4.79) | 3.76 (1.92–7.37) |
## Figures
Fig 1: Number of pre-TAS by country.
<!-- image -->
Fig 2: District-level baseline prevalence by country.
<!-- image -->
Fig 3: Percent pre-TAS failure by each characteristic (unadjusted).
<!-- image -->
Fig 4: Adjusted risk ratios for pre-TAS failure with 95% Confidence Interval from log-binomial model.
<!-- image -->
Fig 5: Analysis of failures by model combinations.
<!-- image -->
## References
- World Health Organization. Lymphatic filariasis: progress report 2000–2009 and strategic plan 2010–2020. Geneva; 2010.
- World Health Organization. Validation of elimination of lymphatic filariasis as a public health problem. Geneva; 2017.
- Global programme to eliminate lymphatic filariasis: progress report, 2018. Wkly Epidemiol Rec (2019)
- World Health Organization. Global programme to eliminate lymphatic filariasis: monitoring and epidemiological assessment of mass drug administration. Geneva; 2011.
- World Health Organization. Strengthening the assessment of lymphatic filariasis transmission and documenting the achievement of elimination—Meeting of the Neglected Tropical Diseases Strategic and Technical Advisory Groups Monitoring and Evaluation Subgroup on Disease-specific Indicators. 2016; 42.
- Kyelem D; Biswas G; Bockarie MJ; Bradley MH; El-Setouhy M; Fischer PU. Determinants of success in national programs to eliminate lymphatic filariasis: a perspective identifying essential elements and research needs. Am J Trop Med Hyg (2008)
- Goldberg EM; King JD; Mupfasoni D; Kwong K; Hay SI; Pigott DM. Ecological and socioeconomic predictors of transmission assessment survey failure for lymphatic filariasis. Am J Trop Med Hyg (2019)
- Cano J; Rebollo MP; Golding N; Pullan RL; Crellen T; Soler A. The global distribution and transmission limits of lymphatic filariasis: past and present. Parasites and Vectors (2014)
- CGIAR-CSI. CGIAR-CSI SRTM 90m DEM Digital Elevation Database. In: .
- USGS NASA. Vegetation indices 16-DAy L3 global 500 MOD13A1 dataset [Internet]. [cited 1 May 2018]. Available: .
- Funk C; Peterson P; Landsfeld M; Pedreros D; Verdin J; Shukla S. The climate hazards infrared precipitation with stations—A new environmental record for monitoring extremes. Sci Data (2015)
- Lloyd CT; Sorichetta A; Tatem AJ. High resolution global gridded data for use in population studies. Sci Data (2017)
- Elvidge CD; Baugh KE; Zhizhin M; Hsu F-C. Why VIIRS data are superior to DMSP for mapping nighttime lights. Proc Asia-Pacific Adv Netw (2013)
- Jambulingam P; Subramanian S; De Vlas SJ; Vinubala C; Stolk WA. Mathematical modelling of lymphatic filariasis elimination programmes in India: required duration of mass drug administration and post-treatment level of infection indicators. Parasites and Vectors (2016)
- Michael E; Malecela-Lazaro MN; Simonsen PE; Pedersen EM; Barker G; Kumar A. Mathematical modelling and the control of lymphatic filariasis. Lancet Infect Dis (2004)
- Stolk WA; Swaminathan S; van Oortmarssen GJ; Das PK; Habbema JDF. Prospects for elimination of bancroftian filariasis by mass drug treatment in Pondicherry, India: a simulation study. J Infect Dis (2003)
- Grady CA; De Rochars MB; Direny AN; Orelus JN; Wendt J; Radday J. Endpoints for lymphatic filariasis programs. Emerg Infect Dis (2007)
- Evans D; McFarland D; Adamani W; Eigege A; Miri E; Schulz J. Cost-effectiveness of triple drug administration (TDA) with praziquantel, ivermectin and albendazole for the prevention of neglected tropical diseases in Nigeria. Ann Trop Med Parasitol (2011)
- Richards FO; Eigege A; Miri ES; Kal A; Umaru J; Pam D. Epidemiological and entomological evaluations after six years or more of mass drug administration for lymphatic filariasis elimination in Nigeria. PLoS Negl Trop Dis (2011)
- Biritwum NK; Yikpotey P; Marfo BK; Odoom S; Mensah EO; Asiedu O. Persistent “hotspots” of lymphatic filariasis microfilaraemia despite 14 years of mass drug administration in Ghana. Trans R Soc Trop Med Hyg (2016)
- Moraga P; Cano J; Baggaley RF; Gyapong JO; Njenga SM; Nikolay B. Modelling the distribution and transmission intensity of lymphatic filariasis in sub-Saharan Africa prior to scaling up interventions: integrated use of geostatistical and mathematical modelling. Parasites and Vectors (2015)
- Irvine MA; Njenga SM; Gunawardena S; Wamae CN; Cano J; Brooker SJ. Understanding the relationship between prevalence of microfilariae and antigenaemia using a model of lymphatic filariasis infection. Trans R Soc Trop Med Hyg (2016)
- Ottesen EA. Efficacy of diethylcarbamazine in eradicating infection with lymphatic-dwelling filariae in humans. Rev Infect Dis (1985)
- Gambhir M; Bockarie M; Tisch D; Kazura J; Remais J; Spear R. Geographic and ecologic heterogeneity in elimination thresholds for the major vector-borne helminthic disease, lymphatic filariasis. BMC Biol (2010)
- World Health Organization. Global programme to eliminate lymphatic filariasis: practical entomology handbook. Geneva; 2013.
- Slater H; Michael E. Predicting the current and future potential distributions of lymphatic filariasis in Africa using maximum entropy ecological niche modelling. PLoS One (2012)
- Slater H; Michael E. Mapping, Bayesian geostatistical analysis and spatial prediction of lymphatic filariasis prevalence in Africa. PLoS One (2013)
- Sabesan S; Raju KHK; Subramanian S; Srivastava PK; Jambulingam P. Lymphatic filariasis transmission risk map of India, based on a geo-environmental risk model. Vector-Borne Zoonotic Dis (2013)
- Stanton MC; Molyneux DH; Kyelem D; Bougma RW; Koudou BG; Kelly-Hope LA. Baseline drivers of lymphatic filariasis in Burkina Faso. Geospat Health (2013)
- Manhenje I; Teresa Galán-Puchades M; Fuentes M V. Socio-environmental variables and transmission risk of lymphatic filariasis in central and northern Mozambique. Geospat Health (2013)
- Ngwira BM; Tambala P; Perez a M; Bowie C; Molyneux DH. The geographical distribution of lymphatic filariasis infection in Malawi. Filaria J (2007)
- Simonsen PE; Mwakitalu ME. Urban lymphatic filariasis. Parasitol Res (2013)
- Proville J; Zavala-Araiza D; Wagner G. Night-time lights: a global, long term look at links to socio-economic trends. PLoS One (2017)
- Endeshaw T; Taye A; Tadesse Z; Katabarwa MN; Shafi O; Seid T. Presence of Wuchereria bancrofti microfilaremia despite seven years of annual ivermectin monotherapy mass drug administration for onchocerciasis control: a study in north-west Ethiopia. Pathog Glob Health (2015)
- Richards FO; Eigege A; Pam D; Kal A; Lenhart A; Oneyka JOA. Mass ivermectin treatment for onchocerciasis: lack of evidence for collateral impact on transmission of Wuchereria bancrofti in areas of co-endemicity. Filaria J (2005)
- Kyelem D; Sanou S; Boatin B a; Medlock J; Couibaly S; Molyneux DH. Impact of long-term ivermectin (Mectizan) on Wuchereria bancrofti and Mansonella perstans infections in Burkina Faso: strategic and policy implications. Ann Trop Med Parasitol (2003)
- Weil GJ; Lammie PJ; Richards FO; Eberhard ML. Changes in circulating parasite antigen levels after treatment of bancroftian filariasis with diethylcarbamazine and ivermectin. J Infect Dis (1991)
- Kumar A; Sachan P. Measuring impact on filarial infection status in a community study: role of coverage of mass drug administration. Trop Biomed (2014)
- Njenga SM; Mwandawiro CS; Wamae CN; Mukoko DA; Omar AA; Shimada M. Sustained reduction in prevalence of lymphatic filariasis infection in spite of missed rounds of mass drug administration in an area under mosquito nets for malaria control. Parasites and Vectors (2011)
- Boyd A; Won KY; McClintock SK; Donovan C V; Laney SJ; Williams SA. A community-based study of factors associated with continuing transmission of lymphatic filariasis in Leogane, Haiti. PLoS Negl Trop Dis (2010)
- Irvine MA; Reimer LJ; Njenga SM; Gunawardena S; Kelly-Hope L; Bockarie M. Modelling strategies to break transmission of lymphatic filariasis—aggregation, adherence and vector competence greatly alter elimination. Parasites and Vectors (2015)
- Irvine MA; Stolk WA; Smith ME; Subramanian S; Singh BK; Weil GJ. Effectiveness of a triple-drug regimen for global elimination of lymphatic filariasis: a modelling study. Lancet Infect Dis (2017)
- Pion SD; Montavon C; Chesnais CB; Kamgno J; Wanji S; Klion AD. Positivity of antigen tests used for diagnosis of lymphatic filariasis in individuals without Wuchereria bancrofti infection but with high loa loa microfilaremia. Am J Trop Med Hyg (2016)
- Wanji S; Esum ME; Njouendou AJ; Mbeng AA; Chounna Ndongmo PW; Abong RA. Mapping of lymphatic filariasis in loiasis areas: a new strategy shows no evidence for Wuchereria bancrofti endemicity in Cameroon. PLoS Negl Trop Dis (2018)
- Chesnais CB; Awaca-Uvon NP; Bolay FK; Boussinesq M; Fischer PU; Gankpala L. A multi-center field study of two point-of-care tests for circulating Wuchereria bancrofti antigenemia in Africa. PLoS Negl Trop Dis (2017)
- Silumbwe A; Zulu JM; Halwindi H; Jacobs C; Zgambo J; Dambe R. A systematic review of factors that shape implementation of mass drug administration for lymphatic filariasis in sub-Saharan Africa. BMC Public Health (2017)
- Adams AM; Vuckovic M; Birch E; Brant TA; Bialek S; Yoon D. Eliminating neglected tropical diseases in urban areas: a review of challenges, strategies and research directions for successful mass drug administration. Trop Med Infect Dis (2018)
- Rao RU; Samarasekera SD; Nagodavithana KC; Dassanayaka TDM; Punchihewa MW; Ranasinghe USB. Reassessment of areas with persistent lymphatic filariasis nine years after cessation of mass drug administration in Sri Lanka. PLoS Negl Trop Dis (2017)
- Xu Z; Graves PM; Lau CL; Clements A; Geard N; Glass K. GEOFIL: a spatially-explicit agent-based modelling framework for predicting the long-term transmission dynamics of lymphatic filariasis in American Samoa. Epidemics (2018)
- Id CM; Tettevi EJ; Mechan F; Idun B; Biritwum N; Osei-atweneboana MY. Elimination within reach: a cross-sectional study highlighting the factors that contribute to persistent lymphatic filariasis in eight communities in rural Ghana. PLoS Negl Trop Dis (2019)
- Eigege A; Kal A; Miri E; Sallau A; Umaru J; Mafuyai H. Long-lasting insecticidal nets are synergistic with mass drug administration for interruption of lymphatic filariasis transmission in Nigeria. PLoS Negl Trop Dis (2013)
- Van den Berg H; Kelly-Hope LA; Lindsay SW. Malaria and lymphatic filariasis: The case for integrated vector management. Lancet Infect Dis (2013)
- Webber R. Eradication of Wuchereria bancrofti infection through vector control. Trans R Soc Trop Med Hyg (1979)

View File

@ -0,0 +1,177 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Potential to reduce greenhouse g ... cattle systems in subtropical regions
item-2 at level 2: paragraph: Ribeiro-Filho Henrique M. N.; 1: ... , California, United States of America
item-3 at level 2: section_header: Abstract
item-4 at level 3: text: Carbon (C) footprint of dairy pr ... uce the C footprint to a small extent.
item-5 at level 2: section_header: Introduction
item-6 at level 3: text: Greenhouse gas (GHG) emissions f ... suitable for food crop production [4].
item-7 at level 3: text: Considering the key role of live ... anagement to mitigate the C footprint.
item-8 at level 3: text: In subtropical climate zones, co ... t in tropical pastures (e.g. [17–19]).
item-9 at level 3: text: It has been shown that dairy cow ... sions from crop and reduced DM intake.
item-10 at level 3: text: The aim of this work was to quan ... uring lactation periods was evaluated.
item-11 at level 2: section_header: Materials and methods
item-12 at level 3: text: An LCA was developed according t ... 90816 - https://www.udesc.br/cav/ceua.
item-13 at level 3: section_header: System boundary
item-14 at level 4: text: The goal of the study was to ass ... n were outside of the system boundary.
item-15 at level 3: section_header: Functional unit
item-16 at level 4: text: The functional unit was one kilo ... tein according to NRC [20] as follows:
item-17 at level 4: text: ECM = Milk production × (0.0929 ... characteristics described in Table 1.
item-18 at level 3: section_header: Data sources and livestock system description
item-19 at level 4: text: The individual feed requirements ... ed to the ad libitum TMR intake group.
item-20 at level 4: text: Using experimental data, three s ... med during an entire lactation period.
item-21 at level 3: section_header: Impact assessment
item-22 at level 4: text: The CO2e emissions were calculat ... 65 for CO2, CH4 and N2O, respectively.
item-23 at level 3: section_header: Feed production
item-24 at level 4: section_header: Diets composition
item-25 at level 5: text: The DM intake of each ingredient ... collected throughout the experiments.
item-26 at level 4: section_header: GHG emissions from crop and pasture production
item-27 at level 5: text: GHG emission factors used for of ... onsume 70% of pastures during grazing.
item-28 at level 5: text: Emissions from on-farm feed prod ... factors described by Rotz et al. [42].
item-29 at level 3: section_header: Animal husbandry
item-30 at level 4: text: The CH4 emissions from enteric f ... 1) = 13.8 + 0.185 × NDF (% DM intake).
item-31 at level 3: section_header: Manure from confined cows and urine and dung from grazing animals
item-32 at level 4: text: The CH4 emission from manure (kg ... for dietary GE per kg of DM (MJ kg-1).
item-33 at level 4: text: The OM digestibility was estimat ... h were 31%, 26% and 46%, respectively.
item-34 at level 4: text: The N2O-N emissions from urine a ... using the IPCC [38] emission factors.
item-35 at level 3: section_header: Farm management
item-36 at level 4: text: Emissions due to farm management ... crop and pasture production section.
item-37 at level 4: text: The amount of fuel use for manur ... me that animals stayed on confinement.
item-38 at level 4: text: The emissions from fuel were est ... × kg CO2e (kg machinery mass)-1 [42].
item-39 at level 4: text: Emissions from electricity for m ... ws in naturally ventilated barns [47].
item-40 at level 4: text: The lower impact of emissions fr ... greater than 5% of total C footprint.
item-41 at level 4: text: Emissions from farm management d ... gas and hard coal, respectively [46].
item-42 at level 3: section_header: Co-product allocation
item-43 at level 4: text: The C footprint for milk produce ... directly assigned to milk production.
item-44 at level 3: section_header: Sensitivity analysis
item-45 at level 4: text: A sensitivity index was calculat ... ses a similar change in the footprint.
item-46 at level 2: section_header: Results and discussion
item-47 at level 3: text: The study has assessed the impac ... , feed production and electricity use.
item-48 at level 3: section_header: Greenhouse gas emissions
item-49 at level 4: text: Depending on emission factors us ... more than 5% of overall GHG emissions.
item-50 at level 4: text: Considering IPCC emission factor ... the C footprint of the dairy systems.
item-51 at level 4: text: The similarity of C footprint be ... of TMR was replaced by pasture access.
item-52 at level 4: text: The lower C footprint in scenari ... r, averaging 0.004 kg N2O-N kg-1 [37].
item-53 at level 3: section_header: Methane emissions
item-54 at level 4: text: The enteric CH4 intensity was si ... ], which did not happen in this study.
item-55 at level 4: text: The lack of difference in enteri ... same scenarios as in this study [26].
item-56 at level 3: section_header: Emissions from excreta and feed production
item-57 at level 4: text: Using IPCC emission factors for ... may not be captured by microbes [65].
item-58 at level 4: text: Using local emission factors for ... be revised for the subtropical region.
item-59 at level 4: text: Emissions for feed production de ... act, particularly in confinements [9].
item-60 at level 3: section_header: Assumptions and limitations
item-61 at level 4: text: The milk production and composit ... ions as a function of soil management.
item-62 at level 3: section_header: Further considerations
item-63 at level 4: text: The potential for using pasture ... g ECM)-1 in case of foot lesions [72].
item-64 at level 4: text: Grazing lands may also improve b ... hange of CO2 would be negligible [76].
item-65 at level 2: section_header: Conclusions
item-66 at level 3: text: This study assessed the C footpr ... on with or without access to pastures.
item-67 at level 2: section_header: Tables
item-68 at level 3: table with [13x3]
item-68 at level 4: caption: Table 1: Descriptive characteristics of the herd.
item-69 at level 3: table with [21x11]
item-69 at level 4: caption: Table 2: Dairy cows diets in different scenariosa.
item-70 at level 3: table with [9x5]
item-70 at level 4: caption: Table 3: GHG emission factors for Off- and On-farm feed production.
item-71 at level 3: table with [28x5]
item-71 at level 4: caption: Table 4: GHG emissions from On-farm feed production.
item-72 at level 3: table with [12x4]
item-72 at level 4: caption: Table 5: Factors for major resource inputs in farm management.
item-73 at level 2: section_header: Figures
item-74 at level 3: picture
item-74 at level 4: caption: Fig 1: Overview of the milk production system boundary considered in the study.
item-75 at level 3: picture
item-75 at level 4: caption: Fig 2: Overall greenhouse gas emissions in dairy cattle systems under various scenarios.
TMR = ad libitum TMR intake, 75TMR = 75% of ad libitum TMR intake with access to pasture, 50TMR = 50% of ad libitum TMR intake with access to pasture. (a) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3 without accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.73 kg CO2e kWh-1 [41]. (b) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3 without accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46]; (c) N2O emission factors for urine and dung from local data [37], feed production EF from Table 4 without accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46]. (d) N2O emission factors for urine and dung from local data [37], feed production emission factors from Table 4 accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46].
item-76 at level 3: picture
item-76 at level 4: caption: Fig 3: Sensitivity of the C footprint.
Sensitivity index = percentage change in C footprint for a 10% change in the given emission source divided by 10% of. (a) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3, production of electricity = 0.73 kg CO2e kWh-1 [41]. (b) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3, production of electricity = 0.205 kg CO2e kWh-1 [46]; (c) N2O emission factors for urine and dung from local data [37], feed production EF from Table 4 without accounting sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46]. (d) N2O emission factors for urine and dung from local data [37], feed production emission factors from Table 4 accounting sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46].
item-77 at level 3: picture
item-77 at level 4: caption: Fig 4: Greenhouse gas emissions (GHG) from manure and feed production in dairy cattle systems.
TMR = ad libitum TMR intake, 75TMR = 75% of ad libitum TMR intake with access to pasture, 50TMR = 50% of ad libitum TMR intake with access to pasture. (a) N2O emission factors for urine and dung from IPCC [38]. (b) Feed production emission factors from Table 3. (c) N2O emission factors for urine and dung from local data [37]. (d) Feed production emission factors from Table 4 accounting sequestered CO2-C from perennial pasture.
item-78 at level 2: section_header: References
item-79 at level 3: list: group list
item-80 at level 4: list_item: Climate Change and Land. Chapter 5: Food Security (2019)
item-81 at level 4: list_item: Herrero M; Henderson B; Havlík P ... ivestock sector. Nat Clim Chang (2016)
item-82 at level 4: list_item: Rivera-Ferre MG; López-i-Gelats ... iley Interdiscip Rev Clim Chang (2016)
item-83 at level 4: list_item: van Zanten HHE; Mollenhorst H; K ... ystems. Int J Life Cycle Assess (2016)
item-84 at level 4: list_item: Hristov AN; Oh J; Firkins L; Dij ... mitigation options. J Anim Sci (2013)
item-85 at level 4: list_item: Hristov AN; Ott T; Tricarico J; ... mitigation options. J Anim Sci (2013)
item-86 at level 4: list_item: Montes F; Meinen R; Dell C; Rotz ... mitigation options. J Anim Sci (2013)
item-87 at level 4: list_item: Ledgard SF; Wei S; Wang X; Falco ... mitigations. Agric Water Manag (2019)
item-88 at level 4: list_item: O'Brien D; Shalloo L; Patton J; ... inement dairy farms. Agric Syst (2012)
item-89 at level 4: list_item: Salou T; Le Mouël C; van der Wer ... nal unit matters!. J Clean Prod (2017)
item-90 at level 4: list_item: Lizarralde C; Picasso V; Rotz CA ... Case Studies. Sustain Agric Res (2014)
item-91 at level 4: list_item: Clark CEF; Kaur R; Millapan LO; ... ction and behavior. J Dairy Sci (2018)
item-92 at level 4: list_item: FAOSTAT. (2017)
item-93 at level 4: list_item: Vogeler I; Mackay A; Vibart R; R ... ms modelling. Sci Total Environ (2016)
item-94 at level 4: list_item: Wilkinson JM; Lee MRF; Rivero MJ ... ate pastures. Grass Forage Sci. (2020)
item-95 at level 4: list_item: Wales WJ; Marett LC; Greenwood J ... ons of Australia. Anim Prod Sci (2013)
item-96 at level 4: list_item: Bargo F; Muller LD; Delahoy JE; ... otal mixed rations. J Dairy Sci (2002)
item-97 at level 4: list_item: Vibart RE; Fellner V; Burns JC; ... ration and pasture. J Dairy Res (2008)
item-98 at level 4: list_item: Mendoza A; Cajarville C; Repetto ... total mixed ration. J Dairy Sci (2016)
item-99 at level 4: list_item: Nutrient Requirements of Dairy Cattle (2001)
item-100 at level 4: list_item: Noizère P; Sauvant D; Delaby L. (2018)
item-101 at level 4: list_item: Lorenz H; Reinsch T; Hess S; Tau ... roduction systems. J Clean Prod (2019)
item-102 at level 4: list_item: INTERNATIONAL STANDARD—Environme ... ent—Requirements and guidelines (2006)
item-103 at level 4: list_item: Environmental management—Life cy ... ciples and framework. Iso 14040 (2006)
item-104 at level 4: list_item: FAO. Environmental Performance o ... ains: Guidelines for assessment (2016)
item-105 at level 4: list_item: Civiero M; Ribeiro-Filho HMN; Sc ... ture Conference,. Foz do Iguaçu (2019)
item-106 at level 4: list_item: IPCC—Intergovernmental Panel on ... d Version). 2014. Available: ttps://.
item-107 at level 4: list_item: INRA. Alimentation des bovins, o ... nra 2007. 4th ed. INRA, editor. 2007.
item-108 at level 4: list_item: Delagarde R; Faverdin P; Baratte ... ng management. Grass Forage Sci (2011)
item-109 at level 4: list_item: Ma BL; Liang BC; Biswas DK; Morr ... tions. Nutr Cycl Agroecosystems (2012)
item-110 at level 4: list_item: Rauccci GS; Moreira CS; Alves PS ... Mato Grosso State. J Clean Prod (2015)
item-111 at level 4: list_item: Camargo GGT; Ryan MR; Richard TL ... nergy Analysis Tool. Bioscience (2013)
item-112 at level 4: list_item: da Silva MSJ; Jobim CC; Poppi EC ... outhern Brazil. Rev Bras Zootec (2015)
item-113 at level 4: list_item: Duchini PGPG Guzatti GCGC; Ribei ... monocultures. Crop Pasture Sci (2016)
item-114 at level 4: list_item: Scaravelli LFB; Pereira LET; Oli ... om vacas leiteiras. Cienc Rural (2007)
item-115 at level 4: list_item: Sbrissia AF; Duchini PG; Zanini ... ge of grazing heights. Crop Sci (2018)
item-116 at level 4: list_item: Almeida JGR; Dall-Orsoletta AC; ... grazing temperate grass. Animal (2020)
item-117 at level 4: list_item: Eggleston H.S.; Buendia L.; Miwa ... nal greenhouse gas inventories. (2006)
item-118 at level 4: list_item: Ramalho B; Dieckow J; Barth G; S ... mbric Ferralsol. Eur J Soil Sci (2020)
item-119 at level 4: list_item: Fernandes HC; da Silveira JCM; R ... nizadas. Cienc e Agrotecnologia (2008)
item-120 at level 4: list_item: Wang M Q. GREET 1.8a Spreadsheet Model. 2007. Available: .
item-121 at level 4: list_item: Rotz CAA; Montes F; Chianese DS; ... e cycle assessment. J Dairy Sci (2010)
item-122 at level 4: list_item: Niu M; Kebreab E; Hristov AN; Oh ... ental database. Glob Chang Biol (2018)
item-123 at level 4: list_item: Eugène M; Sauvant D; Nozière P; ... for ruminants. J Environ Manage (2019)
item-124 at level 4: list_item: Reed KF; Moraes LE; Casper DP; K ... retion from cattle. J Dairy Sci (2015)
item-125 at level 4: list_item: Barros MV; Piekarski CM; De Fran ... the 2016–2026 period. Energies (2018)
item-126 at level 4: list_item: Ludington D; Johnson E. Dairy Fa ... York State Energy Res Dev Auth (2003)
item-127 at level 4: list_item: Thoma G; Jolliet O; Wang Y. A bi ... ply chain analysis. Int Dairy J (2013)
item-128 at level 4: list_item: Naranjo A; Johnson A; Rossow H. ... dairy industry over 50 years. (2020)
item-129 at level 4: list_item: Jayasundara S; Worden D; Weersin ... roduction systems. J Clean Prod (2019)
item-130 at level 4: list_item: Williams SRO; Fisher PD; Berrisf ... ssions. Int J Life Cycle Assess (2014)
item-131 at level 4: list_item: Gollnow S; Lundie S; Moore AD; M ... cows in Australia. Int Dairy J (2014)
item-132 at level 4: list_item: O'Brien D; Capper JL; Garnsworth ... -based dairy farms. J Dairy Sci (2014)
item-133 at level 4: list_item: Chobtang J; McLaren SJ; Ledgard ... Region, New Zealand. J Ind Ecol (2017)
item-134 at level 4: list_item: Garg MR; Phondba BT; Sherasia PL ... cycle assessment. Anim Prod Sci (2016)
item-135 at level 4: list_item: de Léis CM; Cherubini E; Ruviaro ... study. Int J Life Cycle Assess (2015)
item-136 at level 4: list_item: O'Brien D; Geoghegan A; McNamara ... otprint of milk?. Anim Prod Sci (2016)
item-137 at level 4: list_item: O'Brien D; Brennan P; Humphreys ... dology. Int J Life Cycle Assess (2014)
item-138 at level 4: list_item: Baek CY; Lee KM; Park KH. Quanti ... dairy cow system. J Clean Prod (2014)
item-139 at level 4: list_item: Dall-Orsoletta AC; Almeida JGR; ... to late lactation. J Dairy Sci (2016)
item-140 at level 4: list_item: Dall-Orsoletta AC; Oziemblowski ... entation. Anim Feed Sci Technol (2019)
item-141 at level 4: list_item: Niu M; Appuhamy JADRN; Leytem AB ... s simultaneously. Anim Prod Sci (2016)
item-142 at level 4: list_item: Waghorn GC; Law N; Bryant M; Pac ... with fodder beet. Anim Prod Sci (2019)
item-143 at level 4: list_item: Dickhoefer U; Glowacki S; Gómez ... protein and starch. Livest Sci (2018)
item-144 at level 4: list_item: Schwab CG; Broderick GA. A 100-Y ... tion in dairy cows. J Dairy Sci (2017)
item-145 at level 4: list_item: Sordi A; Dieckow J; Bayer C; Alb ... tureland. Agric Ecosyst Environ (2014)
item-146 at level 4: list_item: Simon PL; Dieckow J; de Klein CA ... pastures. Agric Ecosyst Environ (2018)
item-147 at level 4: list_item: Wang X; Ledgard S; Luo J; Guo Y; ... e assessment. Sci Total Environ (2018)
item-148 at level 4: list_item: Pirlo G; Lolli S. Environmental ... Lombardy (Italy). J Clean Prod (2019)
item-149 at level 4: list_item: Herzog A; Winckler C; Zollitsch ... tigation. Agric Ecosyst Environ (2018)
item-150 at level 4: list_item: Mostert PF; van Middelaar CE; Bo ... f milk production. J Clean Prod (2018)
item-151 at level 4: list_item: Mostert PF; van Middelaar CE; de ... of milk production. Agric Syst (2018)
item-152 at level 4: list_item: Foley JA; Ramankutty N; Brauman ... for a cultivated planet. Nature (2011)
item-153 at level 4: list_item: Lal R.. Soil Carbon Sequestratio ... nd Food Security. Science (80-) (2004)
item-154 at level 4: list_item: Boddey RM; Jantalia CP; Conceiça ... al agriculture. Glob Chang Biol (2010)
item-155 at level 4: list_item: McConkey B; Angers D; Bentham M; ... he LULUCF sector for NIR 2014. (2014)
item-156 at level 1: caption: Table 1: Descriptive characteristics of the herd.
item-157 at level 1: caption: Table 2: Dairy cows diets in different scenariosa.
item-158 at level 1: caption: Table 3: GHG emission factors for Off- and On-farm feed production.
item-159 at level 1: caption: Table 4: GHG emissions from On-farm feed production.
item-160 at level 1: caption: Table 5: Factors for major resource inputs in farm management.
item-161 at level 1: caption: Fig 1: Overview of the milk prod ... stem boundary considered in the study.
item-162 at level 1: caption: Fig 2: Overall greenhouse gas em ... lectricity = 0.205 kg CO2e kWh-1 [46].
item-163 at level 1: caption: Fig 3: Sensitivity of the C foot ... lectricity = 0.205 kg CO2e kWh-1 [46].
item-164 at level 1: caption: Fig 4: Greenhouse gas emissions ... uestered CO2-C from perennial pasture.

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,336 @@
# Potential to reduce greenhouse gas emissions through different dairy cattle systems in subtropical regions
Ribeiro-Filho Henrique M. N.; 1: Department of Animal Science, University of California, Davis, California, United States of America, 2: Programa de Pós-graduação em Ciência Animal, Universidade do Estado de Santa Catarina, Lages, Santa Catarina, Brazil; Civiero Maurício; 2: Programa de Pós-graduação em Ciência Animal, Universidade do Estado de Santa Catarina, Lages, Santa Catarina, Brazil; Kebreab Ermias; 1: Department of Animal Science, University of California, Davis, California, United States of America
## Abstract
Carbon (C) footprint of dairy production, expressed in kg C dioxide (CO2) equivalents (CO2e) (kg energy-corrected milk (ECM))-1, encompasses emissions from feed production, diet management and total product output. The proportion of pasture on diets may affect all these factors, mainly in subtropical climate zones, where cows may access tropical and temperate pastures during warm and cold seasons, respectively. The aim of the study was to assess the C footprint of a dairy system with annual tropical and temperate pastures in a subtropical region. The system boundary included all processes up to the animal farm gate. Feed requirement during the entire life of each cow was based on data recorded from Holstein × Jersey cow herds producing an average of 7,000 kg ECM lactation-1. The milk production response as consequence of feed strategies (scenarios) was based on results from two experiments (warm and cold seasons) using lactating cows from the same herd. Three scenarios were evaluated: total mixed ration (TMR) ad libitum intake, 75, and 50% of ad libitum TMR intake with access to grazing either a tropical or temperate pasture during lactation periods. Considering IPCC and international literature values to estimate emissions from urine/dung, feed production and electricity, the C footprint was similar between scenarios, averaging 1.06 kg CO2e (kg ECM)-1. Considering factors from studies conducted in subtropical conditions and actual inputs for on-farm feed production, the C footprint decreased 0.04 kg CO2e (kg ECM)-1 in scenarios including pastures compared to ad libitum TMR. Regardless of factors considered, emissions from feed production decreased as the proportion of pasture went up. In conclusion, decreasing TMR intake and including pastures in dairy cow diets in subtropical conditions have the potential to maintain or reduce the C footprint to a small extent.
## Introduction
Greenhouse gas (GHG) emissions from livestock activities represent 10–12% of global emissions [1], ranging from 5.5–7.5 Gt CO2 equivalents (CO2e) yr-1, with almost 30% coming from dairy cattle production systems [2]. However, the livestock sector supplies between 13 and 17% of calories and between 28 and 33% of human edible protein consumption globally [3]. Additionally, livestock produce more human-edible protein per unit area than crops when land is unsuitable for food crop production [4].
Considering the key role of livestock systems in global food security, several technical and management interventions have been investigated to mitigate methane (CH4) emissions from enteric fermentation [5], animal management [6] and manure management [7]. CH4 emissions from enteric fermentation represent around 34% of total emissions from livestock sector, which is the largest source [2]. Increasing proportions of concentrate and digestibility of forages in the diet have been proposed as mitigation strategies [1,5]. In contrast, some life cycle assessment (LCA) studies of dairy systems in temperate regions [8–11] have identified that increasing concentrate proportion may increase carbon (C) footprint due to greater resource use and pollutants from the production of feed compared to forage. Thus, increasing pasture proportion on dairy cattle systems may be an alternative management to mitigate the C footprint.
In subtropical climate zones, cows may graze tropical pastures rather than temperate pastures during the warm season [12]. Some important dairy production areas, such as southern Brazil, central to northern Argentina, Uruguay, South Africa, New Zealand and Australia, are located in these climate zones, having more than 900 million ha in native, permanent or temporary pastures, producing almost 20% of global milk production [13]. However, due to a considerable inter-annual variation in pasture growth rates [14,15], the interest in mixed systems, using total mixed ration (TMR) + pasture has been increasing [16]. Nevertheless, to our best knowledge, studies conducted to evaluate milk production response in dairy cow diets receiving TMR and pastures have only been conducted in temperate pastures and not in tropical pastures (e.g. [17–19]).
It has been shown that dairy cows receiving TMR-based diets may not decrease milk production when supplemented with temperate pastures in a vegetative growth stage [18]. On the other hand, tropical pastures have lower organic matter digestibility and cows experience reduced dry matter (DM) intake and milk yield compared to temperate pastures [20,21]. A lower milk yield increases the C footprint intensity [22], offsetting an expected advantage through lower GHG emissions from crop and reduced DM intake.
The aim of this work was to quantify the C footprint and land use of dairy systems using cows with a medium milk production potential in a subtropical region. The effect of replacing total mixed ration (TMR) with pastures during lactation periods was evaluated.
## Materials and methods
An LCA was developed according to the ISO standards [23,24] and Food and Agriculture Organization of the United Nations (FAO) Livestock Environmental Assessment Protocol guidelines [25]. All procedures were approved by the Comissão de Ética no Uso de Animais (CEUA/UDESC) on September 15, 2016—Approval number 4373090816 - https://www.udesc.br/cav/ceua.
### System boundary
The goal of the study was to assess the C footprint of annual tropical and temperate pastures in lactating dairy cow diets. The production system was divided into four main processes: (i) animal husbandry, (ii) manure management and urine and dung deposited by grazing animals, (iii) production of feed ingredients and (iv) farm management (Fig 1). The study boundary included all processes up to the animal farm gate (cradle to gate), including secondary sources such as GHG emissions during the production of fuel, electricity, machinery, manufacturing of fertilizer, pesticides, seeds and plastic used in silage production. Fuel combustion and machinery (manufacture and repairs) for manure handling and electricity for milking and confinement were accounted as emissions from farm management. Emissions post milk production were assumed to be similar for all scenarios, therefore, activities including milk processing, distribution, retail or consumption were outside of the system boundary.
### Functional unit
The functional unit was one kilogram of energy-corrected milk (ECM) at the farm gate. All processes in the system were calculated based on one kilogram ECM. The ECM was calculated by multiplying milk production by the ratio of the energy content of the milk to the energy content of standard milk with 4% fat and 3.3% true protein according to NRC [20] as follows:
ECM = Milk production × (0.0929 × fat% + 0.0588× true protein% + 0.192) / (0.0929 × (4%) + 0.0588 × (3.3%) + 0.192), where fat% and protein% are fat and protein percentages in milk, respectively. The average milk production and composition were recorded from the University of Santa Catarina State (Brazil) herd, considering 165 lactations between 2009 and 2018. The herd is predominantly Holstein × Jersey cows, with key characteristics described in Table 1.
### Data sources and livestock system description
The individual feed requirements, as well as the milk production responses based on feed strategies were based on data recorded from the herd described above and two experiments performed using lactating cows from the same herd. Due to the variation on herbage production throughout the year, feed requirements were estimated taking into consideration that livestock systems have a calving period in April, which represents the beginning of fall season in the southern Hemisphere. The experiments have shown a 10% reduction in ECM production in dairy cows that received both 75 and 50% of ad libitum TMR intake with access to grazing a tropical pasture (pearl-millet, Pennisetum glaucum Campeiro) compared to cows receiving ad libitum TMR intake. Cows grazing on a temperate pasture (ryegrass, Lolium multiflorum Maximus) did not need changes to ECM production compared to the ad libitum TMR intake group.
Using experimental data, three scenarios were evaluated during the lactation period: ad libitum TMR intake, and 75, and 50% of ad libitum TMR intake with access to grazing either an annual tropical or temperate pasture as a function of month ([26], Civiero et al., in press). From April to October (210 days) cows accessed an annual temperate pasture (ryegrass), and from November to beginning of February (95 days) cows grazed an annual tropical pasture (pearl-millet). The average annual reduction in ECM production in dairy cows with access to pastures is 3%. This value was assumed during an entire lactation period.
### Impact assessment
The CO2e emissions were calculated by multiplying the emissions of CO2, CH4 and N2O by their 100-year global warming potential (GWP100), based on IPCC assessment report 5 (AR5; [27]). The values of GWP100 are 1, 28 and 265 for CO2, CH4 and N2O, respectively.
### Feed production
#### Diets composition
The DM intake of each ingredient throughout the entire life of animals during lactation periods was calculated for each scenario: cows receiving only TMR, cows receiving 75% of TMR with annual pastures and cows receiving 50% of TMR with annual pastures (Table 2). In each of other phases of life (calf, heifer, dry cow), animals received the same diet, including a perennial tropical pasture (kikuyu grass, Pennisetum clandestinum). The DM intake of calves, heifers and dry cows was calculated assuming 2.8, 2.5 and 1.9% body weight, respectively [20]. In each case, the actual DM intake of concentrate and corn silage was recorded, and pasture DM intake was estimated by the difference between daily expected DM intake and actual DM intake of concentrate and corn silage. For lactating heifers and cows, TMR was formulated to meet the net energy for lactation (NEL) and metabolizable protein (MP) requirements of experimental animals, according to [28]. The INRA system was used because it is possible to estimate pasture DM intake taking into account the TMR intake, pasture management and the time of access to pasture using the GrazeIn model [29], which was integrated in the software INRAtion 4.07 (https://www.inration.educagri.fr/fr/forum.php). The nutrient intake was calculated as a product of TMR and pasture intake and the nutrient contents of TMR and pasture, respectively, which were determined in feed samples collected throughout the experiments.
#### GHG emissions from crop and pasture production
GHG emission factors used for off- and on-farm feed production were based on literature values, and are presented in Table 3. The emission factor used for corn grain is the average of emission factors observed in different levels of synthetic N fertilization [30]. The emission factor used for soybean is based on Brazilian soybean production [31]. The emissions used for corn silage, including feed processing (cutting, crushing and mixing), and annual or perennial grass productions were 3300 and 1500 kg CO2e ha-1, respectively [32]. The DM production (kg ha-1) of corn silage and pastures were based on regional and locally recorded data [33–36], assuming that animals are able to consume 70% of pastures during grazing.
Emissions from on-farm feed production (corn silage and pasture) were estimated using primary and secondary sources based on the actual amount of each input (Table 4). Primary sources were direct and indirect N2O-N emissions from organic and synthetic fertilizers and crop/pasture residues, CO2-C emissions from lime and urea applications, as well as fuel combustion. The direct N2O-N emission factor (kg (kg N input)-1) is based on a local study performed previously [37]. For indirect N2O-N emissions (kg N2O-N (kg NH3-N + NOx)-1), as well as CO2-C emissions from lime + urea, default values proposed by IPCC [38] were used. For perennial pastures, a C sequestration of 0.57 t ha-1 was used based on a 9-year study conducted in southern Brazil [39]. Due to the use of conventional tillage, no C sequestration was considered for annual pastures. The amount of fuel required was 8.9 (no-tillage) and 14.3 L ha-1 (disking) for annual tropical and temperate pastures, respectively [40]. The CO2 from fuel combustion was 2.7 kg CO2 L-1 [41]. Secondary sources of emissions during the production of fuel, machinery, fertilizer, pesticides, seeds and plastic for ensilage were estimated using emission factors described by Rotz et al. [42].
### Animal husbandry
The CH4 emissions from enteric fermentation intensity (g (kg ECM)-1) was a function of estimated CH4 yield (g (kg DM intake)-1), actual DM intake and ECM. The enteric CH4 yield was estimated as a function of neutral detergent fiber (NDF) concentration on total DM intake, as proposed by Niu et al. [43], where: CH4 yield (g (kg DM intake)-1) = 13.8 + 0.185 × NDF (% DM intake).
### Manure from confined cows and urine and dung from grazing animals
The CH4 emission from manure (kg (kg ECM)-1) was a function of daily CH4 emission from manure (kg cow-1) and daily ECM (kg cow-1). The daily CH4 emission from manure was estimated according to IPCC [38], which considered daily volatile solid (VS) excreted (kg DM cow-1) in manure. The daily VS was estimated as proposed by Eugène et al. [44] as: VS = NDOMI + (UE × GE) × (OM/18.45), where: VS = volatile solid excretion on an organic matter (OM) basis (kg day-1), NDOMI = non-digestible OM intake (kg day-1): (1- OM digestibility) × OM intake, UE = urinary energy excretion as a fraction of GE (0.04), GE = gross energy intake (MJ day-1), OM = organic matter (g), 18.45 = conversion factor for dietary GE per kg of DM (MJ kg-1).
The OM digestibility was estimated as a function of chemical composition, using equations published by INRA [21], which takes into account the effects of digestive interactions due to feeding level, the proportion of concentrate and rumen protein balance on OM digestibility. For scenarios where cows had access to grazing, the amount of calculated VS was corrected as a function of the time at pasture. The biodegradability of manure factor (0.13 for dairy cows in Latin America) and methane conversion factor (MCF) values were taken from IPCC [38]. The MCF values for pit storage below animal confinements (> 1 month) were used for the calculation, taking into account the annual average temperature (16.6ºC) or the average temperatures during the growth period of temperate (14.4ºC) or tropical (21ºC) annual pastures, which were 31%, 26% and 46%, respectively.
The N2O-N emissions from urine and feces were estimated considering the proportion of N excreted as manure and storage or as urine and dung deposited by grazing animals. These proportions were calculated based on the proportion of daily time that animals stayed on pasture (7 h/24 h = 0.29) or confinement (1 − 0.29 = 0.71). For lactating heifers and cows, the total amount of N excreted was calculated by the difference between N intake and milk N excretion. For heifers and non-lactating cows, urinary and fecal N excretion were estimated as proposed by Reed et al. [45] (Table 3: equations 10 and 12, respectively). The N2O emissions from stored manure as well as urine and dung during grazing were calculated based on the conversion of N2O-N emissions to N2O emissions, where N2O emissions = N2O-N emissions × 44/28. The emission factors were 0.002 kg N2O-N (kg N)-1 stored in a pit below animal confinements, and 0.02 kg N2O-N (kg of urine and dung)-1 deposited on pasture [38]. The indirect N2O emissions from storage manure and urine and dung deposits on pasture were also estimated using the IPCC [38] emission factors.
### Farm management
Emissions due to farm management included those from fuel and machinery for manure handling and electricity for milking and confinement (Table 5). Emissions due to feed processing such as cutting, crushing, mixing and distributing, as well as secondary sources of emissions during the production of fuel, machinery, fertilizer, pesticides, seeds and plastic for ensilage were included in Emissions from crop and pasture production section.
The amount of fuel used for manure handling was estimated taking into consideration the amount of manure produced per cow and the amounts of fuel required for manure handling (L diesel t-1) [42]. The amount of manure was estimated from OM excretions (kg cow-1), assuming that the manure has 8% ash on DM basis and 60% DM content. The OM excretions were calculated by NDOMI × days in confinement × proportion of daily time that animals stayed on confinement.
The emissions from fuel were estimated considering the primary (emissions from fuel burned) and secondary (emissions for producing and transporting fuel) emissions. The primary emissions were calculated by the amount of fuel required for manure handling (L) × (kg CO2e L-1) [41]. The secondary emissions from fuel were calculated by the amount of fuel required for manure handling × emissions for production and transport of fuel (kg CO2e L-1) [41]. Emissions from manufacture and repair of machinery for manure handling were estimated by manure produced per cow (t) × (kg machinery mass (kg manure)-1 × 10^3) [42] × kg CO2e (kg machinery mass)-1 [42].
Emissions from electricity for milking and confinement were estimated using two emission factors (kg CO2 kWh-1). The first one is based on United States electricity matrix [41], and was used as a reference of an electricity matrix with less hydroelectric power than the region under study. The second is based on the Brazilian electricity matrix [46]. The electricity required for milking activities is 0.06 kWh (kg milk produced)-1 [47]. The annual electricity use for lighting was 75 kWh cow-1, which is the value considered for lactating cows in naturally ventilated barns [47].
The lower impact of emissions from farm management is in agreement with other studies conducted in Europe [9, 62] and USA [42, 55], where the authors found that most emissions in dairy production systems are from enteric fermentation, feed production and emissions from excreta. As emissions from fuel for on-farm feed production were accounted into the emissions from crop and pasture production, total emissions from farm management were not greater than 5% of total C footprint.
Emissions from farm management dropped when the emission factor for electricity generation was based on the Brazilian matrix. In this case, the emission factor for electricity generation (0.205 kg CO2e kWh-1 [46]) is much lower than that in a LCA study conducted in US (0.73 kg CO2e kWh-1 [42]). This apparent discrepancy is explained because in 2016, almost 66% of the electricity generated in Brazil was from hydropower, which has an emission factor of 0.074 kg CO2e kWh-1 against 0.382 and 0.926 kg CO2e kWh-1 produced by natural gas and hard coal, respectively [46].
### Co-product allocation
The C footprint for milk produced in the system was calculated using a biophysical allocation approach, as recommended by the International Dairy Federation [49], and described by Thoma et al. [48]. Briefly, ARmilk = 1 − 6.04 × BMR, where: ARmilk is the allocation ratio for milk and BMR is cow BW at the time of slaughter (kg) + calf BW sold (kg) divided by the total ECM produced during cow's entire life (kg). The ARmilk were 0.854 and 0.849 for TMR and TMR with both pasture scenarios, respectively. The ARmilk was applied to the whole emissions, except for the electricity consumed for milking (milking parlor) and refrigerant loss, which was directly assigned to milk production.
### Sensitivity analysis
A sensitivity index was calculated as described by Rotz et al. [42]. The sensitivity index was defined for each emission source as the percentage change in the C footprint for a 10% change in the given emission source divided by 10%. Thus, a value near 0 indicates a low sensitivity, whereas an index near or greater than 1 indicates a high sensitivity because a change in this value causes a similar change in the footprint.
## Results and discussion
The study has assessed the impact of tropical and temperate pastures in dairy cows fed TMR on the C footprint of dairy production in subtropics. Different factors were taken into consideration to estimate emissions from manure (or urine and dung) of grazing animals, feed production and electricity use.
### Greenhouse gas emissions
Depending on emission factors used for calculating emissions from urine and dung (IPCC or local data) and feed production (Tables 3 or 4), the C footprint was similar (Fig 2A and 2B) or decreased by 0.04 kg CO2e (kg ECM)-1 (Fig 2C and 2D) in scenarios that included pastures compared to ad libitum TMR intake. Due to differences in emission factors, the overall GHG emission values ranged from 0.92 to 1.04 kg CO2e (kg ECM)-1 for dairy cows receiving TMR exclusively, and from 0.88 to 1.04 kg CO2e (kg ECM)-1 for cows with access to pasture. Using IPCC emission factors [38], manure emissions increased as TMR intake went down (Fig 2A and 2B). However, using local emission factors for estimating N2O-N emissions [37], manure emissions decreased as TMR intake went down (Fig 2C and 2D). Regardless of emission factors used (Tables 3 or 4), emissions from feed production decreased to a small extent as the proportion of TMR intake decreased. Emissions from farm management did not contribute more than 5% of overall GHG emissions.
Considering IPCC emission factors for N2O emissions from urine and dung [38] and those from Table 3, the C footprint ranged from 0.99 to 1.04 kg CO2e (kg ECM)-1, and was close to those reported under confined based systems in California [49], Canada [50], China [8], Ireland [9], different scenarios in Australia [51,52] and Uruguay [11], which ranged from 0.98 to 1.16 kg CO2e (kg ECM)-1. When local emission factors for N2O emissions from urine and dung [37] and those from Table 4 were taken into account, the C footprint for scenarios including pasture, without accounting for sequestered CO2-C from perennial pasture—0.91 kg CO2e (kg ECM)-1—was lower than the range of values described above. However, these values were still greater than high-performance confinement systems in UK and USA [53] or grass based dairy systems in Ireland [9,53] and New Zealand [8,54], which ranged from 0.52 to 0.89 kg CO2e (kg ECM)-1. Regardless of which emission factor was used, we found a lower C footprint in all conditions compared to scenarios with lower milk production per cow or in poor conditions of manure management, which ranged from 1.4 to 2.3 kg CO2e (kg ECM)-1 [8,55]. Thus, even though differences between studies may be partially explained by various assumptions (e.g., emission factors, co-product allocation, methane emissions estimation, sequestered CO2-C, etc.), herd productivity and manure management were systematically associated with the C footprint of the dairy systems.
The similarity of C footprint between different scenarios using IPCC [38] for estimating emissions from manure and for emissions from feed production (Table 3) was a consequence of the trade-off between greater manure emissions and lower emissions to produce feed, as the proportion of pasture in diets increased. Additionally, the small negative effect of pasture on ECM production also contributed to the trade-off. The impact of milk production on the C footprint was reported in a meta-analysis comprising 30 studies from 15 different countries [22]. As observed in this study (Fig 2A and 2B) the authors reported no significant difference between the C footprint of pasture-based vs. confinement systems. However, they observed that an increase of 1000 kg cow-1 (5000 to 6000 kg ECM) reduced the C footprint by 0.12 kg CO2e (kg ECM)-1, which may explain an apparent discrepancy between our study and an LCA performed in south Brazilian conditions [56]. Their study compared a confinement and a grazing-based dairy system with annual average milk production of 7667 and 5535 kg cow-1, respectively. In this study, the same herd was used in all systems, with an annual average milk production of around 7000 kg cow-1. Experimental data showed a reduction not greater than 3% of ECM when 50% of TMR was replaced by pasture access.
The lower C footprint in scenarios with access to pasture, when local emission factors [37] were used for N2O emissions from urine and dung and for feed production (Table 4), may also be partially attributed to the small negative effect of pasture on ECM production. Nevertheless, local emission factors for urine and dung had a great impact on scenarios including pastures compared to ad libitum TMR intake. Whereas the IPCC [38] considers an emission of 0.02 kg N2O-N (kg N)-1 for urine and dung from grazing animals, experimental evidence shows that it may be up to five times lower, averaging 0.004 kg N2O-N kg-1 [37].
### Methane emissions
The enteric CH4 intensity was similar between different scenarios (Fig 2), showing the greatest sensitivity index, with values ranging from 0.53 to 0.62, which indicate that for a 10% change in this source, the C footprint may change between 5.3 and 6.2% (Fig 3). The large effect of enteric CH4 emissions on the whole C footprint was expected, because the impact of enteric CH4 on GHG emissions of milk production in different dairy systems has been estimated to range from 44 to 60% of the total CO2e [50,52,57,58]. However, emissions in feed production may be the most important source of GHG when emission factors for producing concentrate feeds are greater than 0.7 kg CO2e kg-1 [59], which did not happen in this study.
The lack of difference in enteric CH4 emissions in different systems can be explained by the narrow range of NDF content in diets (<4% difference). This non-difference is due to the lower NDF content of annual temperate pastures (495 g (kg DM)-1) compared to corn silage (550 g (kg DM)-1). Hence, an expected increase in NDF content with decreased concentrate was partially offset by an increase in the pasture proportion relatively low in NDF. This is in agreement with studies conducted in southern Brazil, which have shown that the actual enteric CH4 emissions may decrease with inclusion of temperate pastures in cows receiving corn silage and soybean meal [60] or increase enteric CH4 emissions when dairy cows grazing a temperate pasture were supplemented with corn silage [61]. Additionally, enteric CH4 emissions did not differ between dairy cows receiving TMR exclusively or grazing a tropical pasture in the same scenarios as in this study [26].
### Emissions from excreta and feed production
Using IPCC emission factors for N2O emissions from urine and dung [38] and those from Table 3, CH4 emissions from manure decreased 0.07 kg CO2e (kg ECM)-1, but N2O emissions from manure increased 0.09 kg CO2e (kg ECM)-1, as TMR intake was restricted to 50% ad libitum (Fig 4A). Emissions for pastures increased by 0.06 kg CO2e (kg ECM)-1, whereas emissions for producing concentrate feeds and corn silage decreased by 0.09 kg CO2e (kg ECM)-1, as TMR intake decreased (Fig 4B). In this situation, the lack of difference in calculated C footprints of different systems was also due to the greater emissions from manure, and offset by lower emissions from feed production with inclusion of pasture in lactating dairy cow diets. The greater N2O-N emissions from manure with pasture was a consequence of higher N2O-N emissions due to greater CP content and N urine excretion, as pasture intake increased. The effect of CP content on urine N excretion has been shown by several authors in lactating dairy cows [62–64]. For instance, by decreasing CP content from 185 to 152 g (kg DM)-1, N intake decreased by 20% and urine N excretion by 60% [62]. In this study, the CP content for lactating dairy cows ranged from 150 g (kg DM)-1 on TMR system to 198 g (kg DM)-1 on 50% TMR with pasture. Additionally, greater urine N excretion is expected with greater use of pasture. This occurs because protein utilization in pastures is inefficient, as the protein in fresh forages is highly degradable in the rumen and may not be captured by microbes [65].
Using local emission factors for N2O emissions from urine and dung [37] and those from Table 4, reductions in CH4 emissions from stocked manure, when pastures were included in diets, were not offset by increases in N2O emissions from excreta (Fig 4C). In this case, total emissions from manure (Fig 4C) and feed production (Fig 4D) decreased with the inclusion of pasture. The impact of greater CP content and N urine excretion with increased pasture intake was offset by the much lower emission factors used for N2O emissions from urine and dung. As suggested by other authors [66,67], these results show that the IPCC default value may need to be revised for the subtropical region.
Emissions for feed production decreased when pasture was included due to the greater emission factor for corn grain production compared to pastures. Emissions from concentrate and silage had at least twice the sensitivity index compared to emissions from pastures. The amount of grain required per cow in a lifetime decreased from 7,300 kg to 4,000 kg when 50% of TMR was replaced by pasture access. These results are in agreement with other studies which found lower C footprint, as concentrate use is reduced and/or pasture is included [9,68,69]. Moreover, it has been demonstrated that in intensive dairy systems, after enteric fermentation, feed production is the second main contributor to C footprint [50]. There is potential to decrease the environmental impact of dairy systems by reducing the use of concentrate ingredients with high environmental impact, particularly in confinements [9].
### Assumptions and limitations
The milk production and composition data are the average for a typical herd, which might have great animal-to-animal variability. Likewise, DM yield of crops and pastures were collected from experimental observations, and may change as a function of inter-annual variation, climatic conditions, soil type, fertilization level etc. The emission factors for direct and indirect N2O emissions from urine and dung were alternatively estimated using local data, but more experiments are necessary to reduce the uncertainty. The CO2 emitted from lime and urea application was estimated from IPCC default values, which may not represent emissions in subtropical conditions. This LCA may be improved by reducing the uncertainty of factors for estimating emissions from excreta and feed production, including the C sequestration or emissions as a function of soil management.
### Further considerations
The potential for using pasture can reduce the C footprint because milk production kept pace with animal confinement. However, if milk production is to decrease with lower TMR intake and inclusion of pasture [19], the C footprint would be expected to increase. Lorenz et al. [22] showed that an increase in milk yield from 5,000 to 6,000 kg ECM reduced the C footprint by 0.12 kg CO2e (kg ECM)-1, whereas an increase from 10,000 to 11,000 kg ECM reduced the C footprint by only 0.06 kg CO2e (kg ECM)-1. Hence, the impact of increasing milk production on decreasing C footprint is not linear, and mitigation measures, such as breeding for increased genetic yield potential and increasing concentrate ratio in the diet, are potentially harmful for animals health and welfare [70]. For instance, increasing concentrate ratio potentially increases the occurrence of subclinical ketosis and foot lesions, and C footprint may increase by 0.03 kg CO2e (kg ECM)-1 in subclinical ketosis [71] and by 0.02 kg CO2e (kg ECM)-1 in case of foot lesions [72].
Grazing lands may also improve biodiversity [73]. Strategies such as zero tillage may increase stocks of soil C [74]. This study did not consider C sequestration during the growth of annual pastures, because it was assumed these grasses were planted with tillage, having a balance between C sequestration and C emissions [38]. Considering the C sequestration from no-tillage perennial pasture, the amount of C sequestration will more than compensate for the C emitted. These results are in agreement with other authors who have shown that a reduction or elimination of soil tillage increases annual soil C sequestration in subtropical areas by 0.5 to 1.5 t ha-1 [75]. If 50% of tilled areas were under perennial grasslands, 1.0 t C ha-1 would be sequestered, further reducing the C footprint by 0.015 and 0.025 kg CO2e (kg ECM)-1 for the scenarios using 75 and 50% TMR, respectively. Eliminating tillage, the reduction in total GHG emissions would be 0.03 and 0.05 kg CO2e (kg ECM)-1 for 75 and 50% TMR, respectively. However, this approach may be controversial because lands which have been consistently managed for decades have approached steady state C storage, so that net exchange of CO2 would be negligible [76].
## Conclusions
This study assessed the C footprint of dairy cattle systems with or without access to pastures. Including pastures showed potential to maintain or decrease to a small extent the C footprint, which may be attributable to the evidence of low N2O emissions from urine and dung in dairy systems in subtropical areas. Even though the enteric CH4 intensity was the largest source of CO2e emissions, it did not change between different scenarios due to the narrow range of NDF content in diets and maintaining the same milk production with or without access to pastures.
## Tables
Table 1: Descriptive characteristics of the herd.
| Item | Unit | Average |
|-------------------------------|-----------|-----------|
| Milking cows | # | 165 |
| Milk production | kg year-1 | 7,015 |
| Milk fat | % | 4.0 |
| Milk protein | % | 3.3 |
| Length of lactation | days | 305 |
| Body weight | kg | 553 |
| Lactations per cow | # | 4 |
| Replacement rate | % | 25 |
| Cull rate | % | 25 |
| First artificial insemination | months | 16 |
| Weaned | days | 60 |
| Mortality | % | 3.0 |
Table 2: Dairy cows diets in different scenariosa.
| | Calf | Calf | Pregnant/dry | Pregnant/dry | Lactation | Lactation | Lactation | Weighted average | Weighted average | Weighted average |
|-----------------------------------|-----------------------------------|-----------------------------------|-----------------------------------|-----------------------------------|-----------------------------------|-----------------------------------|-----------------------------------|-----------------------------------|-----------------------------------|-----------------------------------|
|                                   | 0–12 mo                           | 12-AI mo                          | Heifer                            | Cow                               | TMR                               | TMR75                             | TMR50                             | TMR                               | TMR75                             | TMR50                             |
| Days | 360 | 120 | 270 | 180 | 1220 | 1220 | 1220 | | | |
| DM intake, kg d-1 | 3.35 | 6.90 | 10.4 | 11.0 | 18.7 | 17.2 | 17.0 | 13.8 | 12.9 | 12.8 |
| Ingredients, g (kg DM)-1 | Ingredients, g (kg DM)-1 | Ingredients, g (kg DM)-1 | Ingredients, g (kg DM)-1 | Ingredients, g (kg DM)-1 | Ingredients, g (kg DM)-1 | Ingredients, g (kg DM)-1 | Ingredients, g (kg DM)-1 | Ingredients, g (kg DM)-1 | Ingredients, g (kg DM)-1 | Ingredients, g (kg DM)-1 |
| Ground corn | 309 | 145 | 96.3 | - | 257 | 195 | 142 | 218 | 183 | 153 |
| Soybean meal | 138 | 22 | 26.7 | - | 143 | 105 | 76.1 | 109 | 88.0 | 71.0 |
| Corn silage | 149 | 290 | 85.6 | - | 601 | 451 | 326 | 393 | 308 | 237 |
| Ann temperate pasture | 184 | 326 | 257 | - | - | 185 | 337 | 81.3 | 186 | 273 |
| Ann tropical pasture | - | - | 107 | - | - | 63 | 119 | 13.4 | 49.1 | 81.0 |
| Perenn tropical pasture | 219 | 217 | 428 | 1000 | - | - | - | 186 | 186 | 186 |
| Chemical composition, g (kg DM)-1 | Chemical composition, g (kg DM)-1 | Chemical composition, g (kg DM)-1 | Chemical composition, g (kg DM)-1 | Chemical composition, g (kg DM)-1 | Chemical composition, g (kg DM)-1 | Chemical composition, g (kg DM)-1 | Chemical composition, g (kg DM)-1 | Chemical composition, g (kg DM)-1 | Chemical composition, g (kg DM)-1 | Chemical composition, g (kg DM)-1 |
| Organic matter | 935 | 924 | 913 | 916 | 958 | 939 | 924 | 943 | 932 | 924 |
| Crude protein | 216 | 183 | 213 | 200 | 150 | 170 | 198 | 175 | 186 | 202 |
| Neutral detergent fibre | 299 | 479 | 518 | 625 | 382 | 418 | 449 | 411 | 431 | 449 |
| Acid detergent fibre | 127 | 203 | 234 | 306 | 152 | 171 | 187 | 174 | 185 | 194 |
| Ether extract | 46.5 | 30.4 | 28.6 | 25.0 | 31.8 | 31.1 | 30.4 | 33.2 | 32.8 | 32.4 |
| Nutritive value | Nutritive value | Nutritive value | Nutritive value | Nutritive value | Nutritive value | Nutritive value | Nutritive value | Nutritive value | Nutritive value | Nutritive value |
| OM digestibility, % | 82.1 | 77.9 | 77.1 | 71.9 | 72.4 | 75.0 | 77.2 | 74.8 | 76.3 | 77.6 |
| NEL, Mcal (kg DM)-1 | 1.96 | 1.69 | 1.63 | 1.44 | 1.81 | 1.78 | 1.74 | 1.8 | 1.8 | 1.7 |
| MP, g (kg DM)-1 | 111 | 93.6 | 97.6 | 90.0 | 95.0 | 102 | 102 | 97.5 | 102 | 101 |
Table 3: GHG emission factors for Off- and On-farm feed production.
| Feed | DM yield (kg ha-1) | Emission factor | Unita | References |
|------------------|----------------------|-------------------|----------------------|--------------|
| Off-farm | | | | |
| Corn grain | 7,500 | 0.316 | kg CO2e (kg grain)-1 | [30] |
| Soybean | 2,200 | 0.186 | kg CO2e (kg grain)-1 | [31] |
| On-farm | | | | |
| Corn silageb | 16,000 | 0.206 | kg CO2e (kg DM)-1 | [32,33] |
| Annual ryegrassc | 9,500 | 0.226 | kg CO2e (kg DM)-1 | [32,34] |
| Pearl milletd | 11,000 | 0.195 | kg CO2e (kg DM)-1 | [32,35] |
| Kikuyu grasse | 9,500 | 0.226 | kg CO2e (kg DM)-1 | [32,36] |
Table 4: GHG emissions from On-farm feed production.
| Item | Corn silage | Annual temperate pasture | Annual tropical pasture | Perennial tropical pasture |
|-------------------------------------------|---------------|----------------------------|---------------------------|------------------------------|
| DM yield, kg ha-1 | 16000 | 9500 | 11000 | 9500 |
| Direct N2O emissions to air | | | | |
| N organic fertilizer, kg ha-1a | 150 | 180 | 225 | 225 |
| N synthetic fertilizer | - | 20 | 25 | 25 |
| N from residual DM, kg ha-1b | 70 | 112 | 129 | 112 |
| Emission factor, kg N2O-N (kg N)-1c       | 0.002         | 0.002                      | 0.002                     | 0.002                        |
| kg N2O ha-1 from direct emissions | 0.69 | 0.98 | 1.19 | 1.14 |
| Indirect N2O emissions to air | | | | |
| kg NH3-N+NOx-N (kg organic N)-1b | 0.2 | 0.2 | 0.2 | 0.2 |
| kg NH3-N+NOx-N (kg synthetic N)-1b | 0.1 | 0.1 | 0.1 | 0.1 |
| kg N2O-N (kg NH3-N+NOx-N)-1b | 0.01 | 0.01 | 0.01 | 0.01 |
| kg N2O ha-1 from NH3+NOx volatilized | 0.47 | 0.60 | 0.75 | 0.75 |
| Indirect N2O emissions to soil | | | | |
| kg N losses by leaching (kg N)-1b | 0.3 | 0.3 | 0.3 | 0.3 |
| kg N2O-N (kg N leaching)-1 | 0.0075 | 0.0075 | 0.0075 | 0.0075 |
| kg N2O ha-1 from N losses by leaching | 0.78 | 1.10 | 1.34 | 1.28 |
| kg N2O ha-1 (direct + indirect emissions) | 1.94 | 2.68 | 3.28 | 3.16 |
| kg CO2e ha-1 from N2O emissionsd          | 514           | 710                        | 869                       | 838                          |
| kg CO2 ha-1 from lime+ureab | 515 | 721 | 882 | 852 |
| kg CO2 ha-1 from diesel combustione | 802 | 38 | 23 | 12 |
| kg CO2e from secondary sourcesf | 516 | 205 | 225 | 284 |
| Total CO2e emitted, kg ha-1 | 1833 | 964 | 1130 | 1148 |
| Emission factor, kg CO2e (kg DM)-1g | 0.115 | 0.145 | 0.147 | 0.173 |
| Carbon sequestered, kg ha-1h | - | - | - | 570 |
| Sequestered CO2-C, kg ha-1 | - | - | - | 1393 |
| kg CO2e ha-1 (emitted—sequestered) | 1833 | 964 | 1130 | -245 |
| Emission factor, kg CO2e (kg DM)-1i | 0.115 | 0.145 | 0.147 | -0.037 |
Table 5: Factors for major resource inputs in farm management.
| Item | Factor | Unita | References |
|------------------------------------------|----------|-------------------|--------------|
| Production and transport of diesel | 0.374 | kg CO2e L-1 | [41] |
| Emissions from diesel fuel combustion | 2.637 | kg CO2e L-1 | [41] |
| Production of electricityb | 0.73 | kg CO2e kWh-1 | [41] |
| Production of electricity (alternative)c | 0.205 | kg CO2e kWh-1 | [46] |
| Production of machinery | 3.54 | kg CO2e (kg mm)-1 | [42] |
| Manure handling | | | |
| Fuel for manure handling | 0.600 | L diesel tonne-1 | [42] |
| Machinery for manure handling | 0.17 | kg mm kg-1 | [42] |
| Milking and confinement | | | |
| Electricity for milking | 0.06 | kWh (kg milk)-1 | [47] |
| Electricity for lightingd | 75 | kWh cow-1 | [47] |
## Figures
Fig 1: Overview of the milk production system boundary considered in the study.
<!-- image -->
Fig 2: Overall greenhouse gas emissions in dairy cattle systems under various scenarios.
TMR = ad libitum TMR intake, 75TMR = 75% of ad libitum TMR intake with access to pasture, 50TMR = 50% of ad libitum TMR intake with access to pasture. (a) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3 without accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.73 kg CO2e kWh-1 [41]. (b) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3 without accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46]; (c) N2O emission factors for urine and dung from local data [37], feed production EF from Table 4 without accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46]. (d) N2O emission factors for urine and dung from local data [37], feed production emission factors from Table 4 accounting for sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46].
<!-- image -->
Fig 3: Sensitivity of the C footprint.
Sensitivity index = percentage change in C footprint for a 10% change in the given emission source divided by 10% of. (a) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3, production of electricity = 0.73 kg CO2e kWh-1 [41]. (b) N2O emission factors for urine and dung from IPCC [38], feed production emission factors from Table 3, production of electricity = 0.205 kg CO2e kWh-1 [46]; (c) N2O emission factors for urine and dung from local data [37], feed production EF from Table 4 without accounting sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46]. (d) N2O emission factors for urine and dung from local data [37], feed production emission factors from Table 4 accounting sequestered CO2-C from perennial pasture, production of electricity = 0.205 kg CO2e kWh-1 [46].
<!-- image -->
Fig 4: Greenhouse gas emissions (GHG) from manure and feed production in dairy cattle systems.
TMR = ad libitum TMR intake, 75TMR = 75% of ad libitum TMR intake with access to pasture, 50TMR = 50% of ad libitum TMR intake with access to pasture. (a) N2O emission factors for urine and dung from IPCC [38]. (b) Feed production emission factors from Table 3. (c) N2O emission factors for urine and dung from local data [37]. (d) Feed production emission factors from Table 4 accounting sequestered CO2-C from perennial pasture.
<!-- image -->
## References
- Climate Change and Land. Chapter 5: Food Security (2019)
- Herrero M; Henderson B; Havlík P; Thornton PK; Conant RT; Smith P. Greenhouse gas mitigation potentials in the livestock sector. Nat Clim Chang (2016)
- Rivera-Ferre MG; López-i-Gelats F; Howden M; Smith P; Morton JF; Herrero M. Re-framing the climate change debate in the livestock sector: mitigation and adaptation options. Wiley Interdiscip Rev Clim Chang (2016)
- van Zanten HHE; Mollenhorst H; Klootwijk CW; van Middelaar CE; de Boer IJM. Global food supply: land use efficiency of livestock systems. Int J Life Cycle Assess (2016)
- Hristov AN; Oh J; Firkins L; Dijkstra J; Kebreab E; Waghorn G. SPECIAL TOPICS—Mitigation of methane and nitrous oxide emissions from animal operations: I. A review of enteric methane mitigation options. J Anim Sci (2013)
- Hristov AN; Ott T; Tricarico J; Rotz A; Waghorn G; Adesogan A. SPECIAL TOPICS—Mitigation of methane and nitrous oxide emissions from animal operations: III. A review of animal management mitigation options. J Anim Sci (2013)
- Montes F; Meinen R; Dell C; Rotz A; Hristov AN; Oh J. SPECIAL TOPICS—Mitigation of methane and nitrous oxide emissions from animal operations: II. A review of manure management mitigation options. J Anim Sci (2013)
- Ledgard SF; Wei S; Wang X; Falconer S; Zhang N; Zhang X. Nitrogen and carbon footprints of dairy farm systems in China and New Zealand, as influenced by productivity, feed sources and mitigations. Agric Water Manag (2019)
- O'Brien D; Shalloo L; Patton J; Buckley F; Grainger C; Wallace M. A life cycle assessment of seasonal grass-based and confinement dairy farms. Agric Syst (2012)
- Salou T; Le Mouël C; van der Werf HMG. Environmental impacts of dairy system intensification: the functional unit matters!. J Clean Prod (2017)
- Lizarralde C; Picasso V; Rotz CA; Cadenazzi M; Astigarraga L. Practices to Reduce Milk Carbon Footprint on Grazing Dairy Farms in Southern Uruguay. Case Studies. Sustain Agric Res (2014)
- Clark CEF; Kaur R; Millapan LO; Golder HM; Thomson PC; Horadagoda A. The effect of temperate or tropical pasture grazing state and grain-based concentrate allocation on dairy cattle production and behavior. J Dairy Sci (2018)
- FAOSTAT. (2017)
- Vogeler I; Mackay A; Vibart R; Rendel J; Beautrais J; Dennis S. Effect of inter-annual variability in pasture growth and irrigation response on farm productivity and profitability based on biophysical and farm systems modelling. Sci Total Environ (2016)
- Wilkinson JM; Lee MRF; Rivero MJ; Chamberlain AT. Some challenges and opportunities for grazing dairy cows on temperate pastures. Grass Forage Sci. (2020)
- Wales WJ; Marett LC; Greenwood JS; Wright MM; Thornhill JB; Jacobs JL. Use of partial mixed rations in pasture-based dairying in temperate regions of Australia. Anim Prod Sci (2013)
- Bargo F; Muller LD; Delahoy JE; Cassidy TW. Performance of high producing dairy cows with three different feeding systems combining pasture and total mixed rations. J Dairy Sci (2002)
- Vibart RE; Fellner V; Burns JC; Huntington GB; Green JT. Performance of lactating dairy cows fed varying levels of total mixed ration and pasture. J Dairy Res (2008)
- Mendoza A; Cajarville C; Repetto JL. Short communication: Intake, milk production, and milk fatty acid profile of dairy cows fed diets combining fresh forage with a total mixed ration. J Dairy Sci (2016)
- Nutrient Requirements of Dairy Cattle (2001)
- Nozière P; Sauvant D; Delaby L. (2018)
- Lorenz H; Reinsch T; Hess S; Taube F. Is low-input dairy farming more climate friendly? A meta-analysis of the carbon footprints of different production systems. J Clean Prod (2019)
- INTERNATIONAL STANDARD—Environmental management—Life cycle assessment—Requirements and guidelines (2006)
- Environmental management—Life cycle assessment—Principles and framework. Iso 14040 (2006)
- FAO. Environmental Performance of Large Ruminant Supply Chains: Guidelines for assessment (2016)
- Civiero M; Ribeiro-Filho HMN; Schaitz LH. Pearl-millet grazing decreases daily methane emissions in dairy cows receiving total mixed ration. 7th Greenhouse Gas and Animal Agriculture Conference,. Foz do Iguaçu (2019)
- IPCC—Intergovernmental Panel on Climate Change. Climate Change 2014 Synthesis Report (Unedited Version). 2014. Available: ttps://.
- INRA. Alimentation des bovins, ovins et caprins. Besoins des animaux—valeurs des aliments. Tables Inra 2007. 4th ed. INRA, editor. 2007.
- Delagarde R; Faverdin P; Baratte C; Peyraud JL. GrazeIn: a model of herbage intake and milk production for grazing dairy cows. 2. Prediction of intake under rotational and continuously stocked grazing management. Grass Forage Sci (2011)
- Ma BL; Liang BC; Biswas DK; Morrison MJ; McLaughlin NB. The carbon footprint of maize production as affected by nitrogen fertilizer and maize-legume rotations. Nutr Cycl Agroecosystems (2012)
- Rauccci GS; Moreira CS; Alves PS; Mello FFC; Frazão LA; Cerri CEP. Greenhouse gas assessment of Brazilian soybean production: a case study of Mato Grosso State. J Clean Prod (2015)
- Camargo GGT; Ryan MR; Richard TL. Energy Use and Greenhouse Gas Emissions from Crop Production Using the Farm Energy Analysis Tool. Bioscience (2013)
- da Silva MSJ; Jobim CC; Poppi EC; Tres TT; Osmari MP. Production technology and quality of corn silage for feeding dairy cattle in Southern Brazil. Rev Bras Zootec (2015)
- Duchini PG; Guzatti GC; Ribeiro-Filho HMN; Sbrissia AF. Intercropping black oat (Avena strigosa) and annual ryegrass (Lolium multiflorum) can increase pasture leaf production compared with their monocultures. Crop Pasture Sci (2016)
- Scaravelli LFB; Pereira LET; Olivo CJ; Agnolin CA. Produção e qualidade de pastagens de Coastcross-1 e milheto utilizadas com vacas leiteiras. Cienc Rural (2007)
- Sbrissia AF; Duchini PG; Zanini GD; Santos GT; Padilha DA; Schmitt D. Defoliation strategies in pastures submitted to intermittent stocking method: Underlying mechanisms buffering forage accumulation over a range of grazing heights. Crop Sci (2018)
- Almeida JGR; Dall-Orsoletta AC; Oziemblowski MM; Michelon GM; Bayer C; Edouard N. Carbohydrate-rich supplements can improve nitrogen use efficiency and mitigate nitrogenous gas emissions from the excreta of dairy cows grazing temperate grass. Animal (2020)
- Eggleston H.S.; Buendia L.; Miwa K. IPCC guidelines for national greenhouse gas inventories. (2006)
- Ramalho B; Dieckow J; Barth G; Simon PL; Mangrich AS; Brevilieri RC. No-tillage and ryegrass grazing effects on stocks, stratification and lability of carbon and nitrogen in a subtropical Umbric Ferralsol. Eur J Soil Sci (2020)
- Fernandes HC; da Silveira JCM; Rinaldi PCN. Avaliação do custo energético de diferentes operações agrícolas mecanizadas. Cienc e Agrotecnologia (2008)
- Wang M Q. GREET 1.8a Spreadsheet Model. 2007. Available: .
- Rotz CAA; Montes F; Chianese DS; Chiane DS. The carbon footprint of dairy production systems through partial life cycle assessment. J Dairy Sci (2010)
- Niu M; Kebreab E; Hristov AN; Oh J; Arndt C; Bannink A. Prediction of enteric methane production, yield, and intensity in dairy cattle using an intercontinental database. Glob Chang Biol (2018)
- Eugène M; Sauvant D; Nozière P; Viallard D; Oueslati K; Lherm M. A new Tier 3 method to calculate methane emission inventory for ruminants. J Environ Manage (2019)
- Reed KF; Moraes LE; Casper DP; Kebreab E. Predicting nitrogen excretion from cattle. J Dairy Sci (2015)
- Barros MV; Piekarski CM; De Francisco AC. Carbon footprint of electricity generation in Brazil: An analysis of the 2016–2026 period. Energies (2018)
- Ludington D; Johnson E. Dairy Farm Energy Audit Summary. New York State Energy Res Dev Auth (2003)
- Thoma G; Jolliet O; Wang Y. A biophysical approach to allocation of life cycle environmental burdens for fluid milk supply chain analysis. Int Dairy J (2013)
- Naranjo A; Johnson A; Rossow H. Greenhouse gas, water, and land footprint per unit of production of the California dairy industry over 50 years. (2020)
- Jayasundara S; Worden D; Weersink A; Wright T; VanderZaag A; Gordon R. Improving farm profitability also reduces the carbon footprint of milk production in intensive dairy production systems. J Clean Prod (2019)
- Williams SRO; Fisher PD; Berrisford T; Moate PJ; Reynard K. Reducing methane on-farm by feeding diets high in fat may not always reduce life cycle greenhouse gas emissions. Int J Life Cycle Assess (2014)
- Gollnow S; Lundie S; Moore AD; McLaren J; van Buuren N; Stahle P. Carbon footprint of milk production from dairy cows in Australia. Int Dairy J (2014)
- O'Brien D; Capper JL; Garnsworthy PC; Grainger C; Shalloo L. A case study of the carbon footprint of milk from high-performing confinement and grass-based dairy farms. J Dairy Sci (2014)
- Chobtang J; McLaren SJ; Ledgard SF; Donaghy DJ. Consequential Life Cycle Assessment of Pasture-based Milk Production: A Case Study in the Waikato Region, New Zealand. J Ind Ecol (2017)
- Garg MR; Phondba BT; Sherasia PL; Makkar HPS. Carbon footprint of milk production under smallholder dairying in Anand district of Western India: A cradle-to-farm gate life cycle assessment. Anim Prod Sci (2016)
- de Léis CM; Cherubini E; Ruviaro CF; Prudêncio da Silva V; do Nascimento Lampert V; Spies A. Carbon footprint of milk production in Brazil: a comparative case study. Int J Life Cycle Assess (2015)
- O'Brien D; Geoghegan A; McNamara K; Shalloo L. How can grass-based dairy farmers reduce the carbon footprint of milk?. Anim Prod Sci (2016)
- O'Brien D; Brennan P; Humphreys J; Ruane E; Shalloo L. An appraisal of carbon footprint of milk from commercial grass-based dairy farms in Ireland according to a certified life cycle assessment methodology. Int J Life Cycle Assess (2014)
- Baek CY; Lee KM; Park KH. Quantification and control of the greenhouse gas emissions from a dairy cow system. J Clean Prod (2014)
- Dall-Orsoletta AC; Almeida JGR; Carvalho PCF; Savian JV; Ribeiro-Filho HMN. Ryegrass pasture combined with partial total mixed ration reduces enteric methane emissions and maintains the performance of dairy cows during mid to late lactation. J Dairy Sci (2016)
- Dall-Orsoletta AC; Oziemblowski MM; Berndt A; Ribeiro-Filho HMN. Enteric methane emission from grazing dairy cows receiving corn silage or ground corn supplementation. Anim Feed Sci Technol (2019)
- Niu M; Appuhamy JADRN; Leytem AB; Dungan RS; Kebreab E. Effect of dietary crude protein and forage contents on enteric methane emissions and nitrogen excretion from dairy cows simultaneously. Anim Prod Sci (2016)
- Waghorn GC; Law N; Bryant M; Pacheco D; Dalley D. Digestion and nitrogen excretion by Holstein-Friesian cows in late lactation offered ryegrass-based pasture supplemented with fodder beet. Anim Prod Sci (2019)
- Dickhoefer U; Glowacki S; Gómez CA; Castro-Montoya JM. Forage and protein use efficiency in dairy cows grazing a mixed grass-legume pasture and supplemented with different levels of protein and starch. Livest Sci (2018)
- Schwab CG; Broderick GA. A 100-Year Review: Protein and amino acid nutrition in dairy cows. J Dairy Sci (2017)
- Sordi A; Dieckow J; Bayer C; Alburquerque MA; Piva JT; Zanatta JA. Nitrous oxide emission factors for urine and dung patches in a subtropical Brazilian pastureland. Agric Ecosyst Environ (2014)
- Simon PL; Dieckow J; de Klein CAM; Zanatta JA; van der Weerden TJ; Ramalho B. Nitrous oxide emission factors from cattle urine and dung, and dicyandiamide (DCD) as a mitigation strategy in subtropical pastures. Agric Ecosyst Environ (2018)
- Wang X; Ledgard S; Luo J; Guo Y; Zhao Z; Guo L. Environmental impacts and resource use of milk production on the North China Plain, based on life cycle assessment. Sci Total Environ (2018)
- Pirlo G; Lolli S. Environmental impact of milk production from samples of organic and conventional farms in Lombardy (Italy). J Clean Prod (2019)
- Herzog A; Winckler C; Zollitsch W. In pursuit of sustainability in dairy farming: A review of interdependent effects of animal welfare improvement and environmental impact mitigation. Agric Ecosyst Environ (2018)
- Mostert PF; van Middelaar CE; Bokkers EAM; de Boer IJM. The impact of subclinical ketosis in dairy cows on greenhouse gas emissions of milk production. J Clean Prod (2018)
- Mostert PF; van Middelaar CE; de Boer IJM; Bokkers EAM. The impact of foot lesions in dairy cows on greenhouse gas emissions of milk production. Agric Syst (2018)
- Foley JA; Ramankutty N; Brauman KA; Cassidy ES; Gerber JS; Johnston M. Solutions for a cultivated planet. Nature (2011)
- Lal R.. Soil Carbon Sequestration Impacts on Global Climate Change and Food Security. Science (80-) (2004)
- Boddey RM; Jantalia CP; Conceiçao PC; Zanatta JA; Bayer C; Mielniczuk J. Carbon accumulation at depth in Ferralsols under zero-till subtropical agriculture. Glob Chang Biol (2010)
- McConkey B; Angers D; Bentham M; Boehm M; Brierley T; Cerkowniak D. Canadian agricultural greenhouse gas monitoring accounting and reporting system: methodology and greenhouse gas estimates for agricultural land in the LULUCF sector for NIR 2014. (2014)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,74 @@
import json
import logging
import os
from io import BytesIO
from pathlib import Path
from docling_core.types.doc import DoclingDocument
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter
# When True, verify_export() overwrites each ground-truth file with the
# current output instead of comparing against it.
GENERATE = False
def get_pubmed_paths() -> list[Path]:
    """Return the PubMed XML fixtures found under ``data/pubmed`` next to this file.

    The directory is searched recursively for ``*.xml`` files and the result
    is sorted so the tests visit the fixtures in a deterministic order.
    """
    # Build the location with pathlib instead of string concatenation: when
    # ``__file__`` has no directory component, ``os.path.dirname`` returns ""
    # and "<dirname>/data/pubmed/" would resolve to the absolute path
    # "/data/pubmed/" rather than a directory beside this module.
    directory = Path(__file__).parent / "data" / "pubmed"
    return sorted(directory.rglob("*.xml"))
def get_converter():
    """Build a DocumentConverter whose allowed formats are limited to XML_PUBMED."""
    return DocumentConverter(allowed_formats=[InputFormat.XML_PUBMED])
def verify_export(pred_text: str, gtfile: str) -> bool:
    """Check *pred_text* against the ground-truth file *gtfile*.

    If the file does not exist yet, or the module-level ``GENERATE`` flag is
    set, the file is (re)written with *pred_text* and the check passes
    trivially. Otherwise the file is read back and compared for exact
    equality.

    Args:
        pred_text: Text produced by the conversion under test.
        gtfile: Path of the ground-truth file to compare with or create.

    Returns:
        True when the ground truth was written or matches *pred_text*.

    Raises:
        AssertionError: If the stored ground truth differs from *pred_text*.
    """
    if not os.path.exists(gtfile) or GENERATE:
        # Pin the encoding: the fixtures contain non-ASCII text, and the
        # platform default encoding is not UTF-8 everywhere.
        with open(gtfile, "w", encoding="utf-8") as fw:
            fw.write(pred_text)
        return True

    with open(gtfile, "r", encoding="utf-8") as fr:
        true_text = fr.read()

    assert pred_text == true_text, f"pred_text!=true_text for {gtfile}"
    # Still meaningful when asserts are stripped (python -O): the caller
    # asserts on this return value.
    return pred_text == true_text
def test_e2e_pubmed_conversions(use_stream=False):
    """Convert every PubMed fixture and compare its exports with the ground truth.

    For each XML file returned by ``get_pubmed_paths()`` the document is
    converted and its Markdown, indented-text and JSON exports are checked
    with ``verify_export`` against files named after the fixture (with
    ``.md``, ``.itxt`` and ``.json`` appended) in the sibling
    ``groundtruth/docling_v2`` directory.

    Args:
        use_stream: When True, feed the converter an in-memory
            ``DocumentStream`` instead of the file path.
    """
    pubmed_paths = get_pubmed_paths()
    converter = get_converter()

    for pubmed_path in pubmed_paths:
        gt_path = (
            pubmed_path.parent.parent / "groundtruth" / "docling_v2" / pubmed_path.name
        )

        # Declared once so both branches assign the same annotated name.
        conv_result: ConversionResult
        if use_stream:
            # read_bytes() opens and closes the file itself; the previous
            # open("rb").read() never closed the handle.
            buf = BytesIO(pubmed_path.read_bytes())
            stream = DocumentStream(name=pubmed_path.name, stream=buf)
            conv_result = converter.convert(stream)
        else:
            conv_result = converter.convert(pubmed_path)
        doc: DoclingDocument = conv_result.document

        pred_md: str = doc.export_to_markdown()
        assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"

        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        assert verify_export(
            pred_itxt, str(gt_path) + ".itxt"
        ), "export to indented-text"

        pred_json: str = json.dumps(doc.export_to_dict(), indent=2)
        assert verify_export(pred_json, str(gt_path) + ".json"), "export to json"
def test_e2e_pubmed_conversions_stream():
    """Run the end-to-end PubMed check with in-memory stream input."""
    test_e2e_pubmed_conversions(True)
def test_e2e_pubmed_conversions_no_stream():
    """Run the end-to-end PubMed check with file-path input."""
    test_e2e_pubmed_conversions(False)

View File

@ -86,6 +86,25 @@ def test_guess_format(tmp_path):
doc_path = Path("./tests/data/uspto/pftaps057006474.txt") doc_path = Path("./tests/data/uspto/pftaps057006474.txt")
assert dci._guess_format(doc_path) == InputFormat.XML_USPTO assert dci._guess_format(doc_path) == InputFormat.XML_USPTO
# Valid XML PubMed
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.xml").open("rb").read())
stream = DocumentStream(name="elife-56337.xml", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_PUBMED
doc_path = Path("./tests/data/pubmed/elife-56337.xml")
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.nxml").open("rb").read())
stream = DocumentStream(name="elife-56337.nxml", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_PUBMED
doc_path = Path("./tests/data/pubmed/elife-56337.nxml")
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
buf = BytesIO(Path("./tests/data/pubmed/elife-56337.txt").open("rb").read())
stream = DocumentStream(name="elife-56337.txt", stream=buf)
assert dci._guess_format(stream) == InputFormat.XML_PUBMED
doc_path = Path("./tests/data/pubmed/elife-56337.txt")
assert dci._guess_format(doc_path) == InputFormat.XML_PUBMED
# Valid XML, non-supported flavor # Valid XML, non-supported flavor
xml_content = ( xml_content = (
'<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE docling_test SYSTEM ' '<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE docling_test SYSTEM '